srctree

Robin Linden parent 22691678 fd231f69
idna: Process ucd code point decomposition data into a C++ish table

This is needed for the NFC normalization we have to perform when doing the UTS46 processing.

inlinesplit
WORKSPACE added: 114, removed: 4, total 110
@@ -250,6 +250,14 @@ http_archive(
url = "https://github.com/nothings/stb/archive/ae721c50eaf761660b4f90cc590453cdb0c2acd0.tar.gz",
)
 
# https://www.unicode.org/Public/
# Fetches the Unicode Character Database 15.1.0 zip and exposes it as the
# external repository @ucd. The inline BUILD file only exports
# UnicodeData.txt, so that is the one file other packages can reference
# (as @ucd//:UnicodeData.txt). The download is pinned by the sha256 integrity
# value below.
http_archive(
name = "ucd",
build_file_content = """exports_files(["UnicodeData.txt"])""",
integrity = "sha256-yxxmPQU5JlAM1QEilzYEV1JxOgZr11gCCYWYt6cFYXc=",
url = "https://www.unicode.org/Public/15.1.0/ucd/UCD.zip",
)
 
# https://github.com/illiliti/libudev-zero
http_archive(
name = "udev-zero", # ISC
 
idna/BUILD added: 114, removed: 4, total 110
@@ -15,13 +15,29 @@ genrule(
tools = [":idna_data_processor"],
)
 
# Build-time tool: the Python script that reads UnicodeData.txt and prints the
# generated C++ header. It is run by the generate_unicode_data genrule.
py_binary(
name = "unicode_data_processor",
srcs = ["unicode_data_processor.py"],
)
 
# Generates unicode_data.h by running unicode_data_processor on
# UnicodeData.txt from the @ucd repository. The tool writes the header to
# stdout, which the command redirects into the single declared output ($@).
genrule(
name = "generate_unicode_data",
srcs = ["@ucd//:UnicodeData.txt"],
outs = ["unicode_data.h"],
cmd = "$(location :unicode_data_processor) $(location @ucd//:UnicodeData.txt) >$@",
tools = [":unicode_data_processor"],
)
 
cc_library(
name = "idna",
srcs = glob(
include = ["*.cpp"],
exclude = ["*_test.cpp"],
),
hdrs = [":generate_idna_data"] + glob(["*.h"]),
hdrs = glob(["*.h"]) + [
":generate_idna_data",
":generate_unicode_data",
],
copts = HASTUR_COPTS,
visibility = ["//visibility:public"],
deps = ["//util:unicode"],
 
filename was Deleted added: 114, removed: 4, total 110
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
 
# SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
#
# SPDX-License-Identifier: BSD-2-Clause
 
import dataclasses
import operator
import sys
import textwrap
import typing
 
 
@dataclasses.dataclass
class Decomposition:
    """A code point paired with the code points of its decomposition mapping."""

    # The code point the record describes.
    code_point: int
    # The code points listed in the record's decomposition field, in order.
    decomposes_to: typing.List[int]

    @staticmethod
    def parse_from(line: str) -> "Decomposition":
        """Build a Decomposition from one semicolon-separated record.

        Field 0 is read as the hexadecimal code point and field 5 as a list
        of hexadecimal code points separated by single spaces.

        Raises:
            IndexError: if the record has fewer than six fields.
            ValueError: if either field contains something that is not a
                hexadecimal number (e.g. a "<tag>" prefix or an empty field).
        """
        fields = line.split(";")
        source, mapping = fields[0], fields[5]
        source_value = int(source, base=16)
        targets = []
        for token in mapping.split(" "):
            targets.append(int(token, base=16))
        return Decomposition(source_value, targets)

    def to_cxx_class(self) -> str:
        """Render this entry as a C++ braced initializer.

        The code point is written in decimal and the decomposition as a string
        literal made of one eight-digit ``\\U`` escape per code point.
        """
        literal = "".join(map("\\U{:08X}".format, self.decomposes_to))
        return f'{{{self.code_point}, "{literal}"}}'
 
 
# https://www.unicode.org/reports/tr44/#UnicodeData.txt
# Index of the decomposition field in a semicolon-separated record.
_DECOMPOSITION_FIELD = 5

# Skeleton of the generated header. It is dedented once, before any generated
# rows are substituted, so the rows' own indentation cannot influence how much
# margin gets stripped. It is filled in with str.format, hence the doubled
# braces around literal C++ braces.
_HEADER_TEMPLATE = textwrap.dedent(
    """\
    // SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
    //
    // SPDX-License-Identifier: BSD-2-Clause

    // This file is generated. Do not touch it.

    #ifndef IDNA_UNICODE_DATA_H_
    #define IDNA_UNICODE_DATA_H_
    // clang-format off

    #include <array>
    #include <string_view>

    namespace idna::unicode {{

    struct Decomposition {{
        char32_t code_point{{}};
        std::string_view decomposes_to{{}};
        constexpr bool operator==(Decomposition const &) const = default;
    }};

    constexpr std::array<Decomposition, {count}> kDecompositions{{{{
        {rows}
    }}}};

    }} // namespace idna::unicode

    // clang-format on
    #endif
    """
)


def _has_canonical_decomposition(line: str) -> bool:
    """Return True if the record carries an untagged decomposition mapping.

    Records whose decomposition field is empty, or starts with a "<tag>", are
    rejected; so are blank lines (e.g. a trailing newline at end of file).

    Raises:
        IndexError: if a non-blank record has fewer than six fields.
    """
    if not line.strip():
        return False
    mapping = line.split(";")[_DECOMPOSITION_FIELD].strip()
    return bool(mapping) and not mapping.startswith("<")


def _render_header(rows: typing.Sequence[str]) -> str:
    """Return the generated header text containing one array entry per row."""
    # The join happens outside of any f-string: a backslash inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    return _HEADER_TEMPLATE.format(count=len(rows), rows=",\n    ".join(rows))


def main(
    argv: typing.Optional[typing.Sequence[str]] = None,
    out: typing.Optional[typing.BinaryIO] = None,
) -> int:
    """Convert UnicodeData.txt into a C++ header of canonical decompositions.

    Args:
        argv: Program name followed by the path to UnicodeData.txt. Defaults
            to sys.argv.
        out: Binary stream that receives the UTF-8 encoded header. Defaults
            to sys.stdout.buffer.

    Returns:
        The process exit status: 0 on success, 1 if the arguments are wrong
        (a usage message is printed to stderr in that case).
    """
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        print(
            f"Usage: {args[0]} <UnicodeData.txt>",
            file=sys.stderr,
        )
        return 1

    # Explicit encoding so reading does not depend on the build machine's locale.
    with open(args[1], encoding="utf-8") as table:
        decompositions = [
            Decomposition.parse_from(line)
            for line in table
            if _has_canonical_decomposition(line)
        ]

    header = _render_header([d.to_cxx_class() for d in decompositions])
    sink = sys.stdout.buffer if out is None else out
    # Written as bytes so the non-ASCII copyright line is always UTF-8.
    sink.write(header.encode("utf-8"))
    return 0


if __name__ == "__main__":
    sys.exit(main())