srctree

Robin Linden parent fd231f69 6e5458ca
idna: Add a helper for applying NFD-normalization to a string

inline split

filename was Deleted added: 112, removed: 3, total 109

@@ -0,0 +1,55 @@

// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>

// SPDX-License-Identifier: BSD-2-Clause

#include "idna/unicode.h"

#include "idna/unicode_data.h"

#include "util/unicode.h"

// NOLINTNEXTLINE(misc-include-cleaner): This is used for std::ranges::lower_bound.
#include <algorithm>
#include <iterator>
#include <ostream>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>

namespace idna {

namespace {

// Writes the decomposition of `code_point` to `os`.
//
// The code point is looked up in unicode::kDecompositions. If it has no entry
// there, it is written as-is via util::unicode_to_utf8. Otherwise each code
// point of the entry's `decomposes_to` is decomposed in turn.
//
// NOTE(review): the binary search assumes unicode::kDecompositions is sorted
// by `code_point` in ascending order -- confirm against the table generator.
void decompose_to(std::ostream &os, char32_t code_point) {
    // * clang-tidy thinks std::ranges::lower_bound is provided by
    //   <bits/ranges_algo.h> when it's actually provided by <algorithm>.
    // * clang-tidy says this is pointer-ish, but msvc disagrees.
    // NOLINTNEXTLINE(misc-include-cleaner,readability-qualified-auto)
    auto maybe_decomposition = std::ranges::lower_bound(
            unicode::kDecompositions, code_point, {}, &decltype(unicode::kDecompositions)::value_type::code_point);

    // This code point does not decompose. lower_bound returns the end of the
    // table when code_point is greater than every key in it, so that case has
    // to be ruled out before the iterator may be dereferenced.
    if (maybe_decomposition == std::ranges::end(unicode::kDecompositions)
            || maybe_decomposition->code_point != code_point) {
        os << util::unicode_to_utf8(code_point);
        return;
    }

    // Recursively decompose the decomposition. This is needed as some code
    // points decompose into code points that also decompose.
    for (auto const decomposed : util::CodePointView{maybe_decomposition->decomposes_to}) {
        decompose_to(os, decomposed);
    }
}

} // namespace

// Walks `input` one code point at a time, writes each code point's
// decomposition to a string stream, and returns the accumulated text.
std::string Unicode::decompose(std::string_view input) {
    std::stringstream decomposed{};

    for (auto const cp : util::CodePointView{input}) {
        decompose_to(decomposed, cp);
    }

    // Calling str() on an rvalue stream hands over the buffer instead of
    // copying it.
    return std::move(decomposed).str();
}

} // namespace idna

filename was Deleted added: 112, removed: 3, total 109

@@ -0,0 +1,21 @@

// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>

// SPDX-License-Identifier: BSD-2-Clause

#ifndef IDNA_UNICODE_H_

#define IDNA_UNICODE_H_

#include <string>

#include <string_view>

namespace idna {

// Holder for static Unicode helper functions; it has no state of its own.
class Unicode {
public:
    // Returns a new string holding `input` normalized into its canonical
    // decomposition (NFD). `input` itself is left untouched.
    static std::string decompose(std::string_view input);
};

} // namespace idna

#endif

filename was Deleted added: 112, removed: 3, total 109

@@ -0,0 +1,33 @@

// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>

// SPDX-License-Identifier: BSD-2-Clause

#include "idna/unicode.h"

#include "etest/etest2.h"

int main() {

etest::Suite s{};

s.add_test("not decomposed", [](etest::IActions &a) {

a.expect_eq(idna::Unicode::decompose("abc123xyz"), "abc123xyz"); //

});

s.add_test("decomposed", [](etest::IActions &a) {

// A + COMBINING RING ABOVE

a.expect_eq(idna::Unicode::decompose("Å"), "A\xcc\x8a");

// s + COMBINING DOT BELOW + COMBINING DOT ABOVE

a.expect_eq(idna::Unicode::decompose("ṩ"), "s\xcc\xa3\xcc\x87");

});

s.add_test("mixed", [](etest::IActions &a) {

// s + COMBINING DOT BELOW + COMBINING DOT ABOVE

a.expect_eq(idna::Unicode::decompose("123ṩ567"),

"123"

"s\xcc\xa3\xcc\x87"

"567");

});

return s.run();

}