srctree

Robin Linden parent 22691678 fd231f69
idna: Process ucd code point decomposition data into a C++ish table

This is needed for the NFC normalization we have to perform when doing the UTS46 processing.

inline split

WORKSPACE added: 114, removed: 4, total 110

@@ -250,6 +250,14 @@ http_archive(

url = "https://github.com/nothings/stb/archive/ae721c50eaf761660b4f90cc590453cdb0c2acd0.tar.gz",

)

# https://www.unicode.org/Public/

# Unicode Character Database archive, pinned to the 15.1.0 release by URL and
# verified against the sha256 integrity value below.
http_archive(
    name = "ucd",
    # The archive ships no BUILD file; this one exports only UnicodeData.txt,
    # so that is the only file other packages can reference as @ucd//:...
    build_file_content = """exports_files(["UnicodeData.txt"])""",
    integrity = "sha256-yxxmPQU5JlAM1QEilzYEV1JxOgZr11gCCYWYt6cFYXc=",
    url = "https://www.unicode.org/Public/15.1.0/ucd/UCD.zip",
)

# https://github.com/illiliti/libudev-zero

http_archive(

name = "udev-zero", # ISC

idna/BUILD added: 114, removed: 4, total 110

@@ -15,13 +15,29 @@ genrule(

tools = [":idna_data_processor"],

)

# Build-time tool: takes the path to UnicodeData.txt as its only argument and
# writes a generated C++ header to stdout.
py_binary(
    name = "unicode_data_processor",
    srcs = ["unicode_data_processor.py"],
)

# Generates unicode_data.h by running unicode_data_processor on the UCD's
# UnicodeData.txt.
genrule(
    name = "generate_unicode_data",
    srcs = ["@ucd//:UnicodeData.txt"],
    outs = ["unicode_data.h"],
    # The tool prints the header to stdout; redirect it into the single output ($@).
    cmd = "$(location :unicode_data_processor) $(location @ucd//:UnicodeData.txt) >$@",
    tools = [":unicode_data_processor"],
)

cc_library(

name = "idna",

srcs = glob(

include = ["*.cpp"],

exclude = ["*_test.cpp"],

hdrs = [":generate_idna_data"] + glob(["*.h"]),

hdrs = glob(["*.h"]) + [

":generate_idna_data",

":generate_unicode_data",

copts = HASTUR_COPTS,

visibility = ["//visibility:public"],

deps = ["//util:unicode"],

filename was Deleted added: 114, removed: 4, total 110

@@ -0,0 +1,86 @@

#!/usr/bin/env python3

# SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>

# SPDX-License-Identifier: BSD-2-Clause

import dataclasses

import operator

import sys

import textwrap

import typing

@dataclasses.dataclass
class Decomposition:
    """One decomposition mapping read from a UnicodeData.txt record."""

    # The code point the record describes.
    code_point: int
    # The code points it decomposes to, in the order listed in the record.
    decomposes_to: typing.List[int]

    @staticmethod
    def parse_from(line: str) -> "Decomposition":
        """Build a Decomposition from one ``;``-separated UnicodeData.txt line.

        Field 0 is read as the hexadecimal code point and field 5 as a
        space-separated list of hexadecimal code points.

        Raises:
            IndexError: If the line has fewer than six fields.
            ValueError: If either field contains something that is not a
                hexadecimal number (e.g. an empty or ``<tag>``-prefixed
                mapping).
        """
        fields = line.split(";")
        source_field = fields[0]
        mapping_field = fields[5]

        source = int(source_field, base=16)
        targets = []
        for token in mapping_field.split(" "):
            targets.append(int(token, base=16))

        return Decomposition(code_point=source, decomposes_to=targets)

    def to_cxx_class(self) -> str:
        """Render this entry as a C++ braced initializer.

        The code point is written in decimal, followed by a string literal
        holding one ``\\UXXXXXXXX`` escape per target code point.
        """
        escapes = []
        for target in self.decomposes_to:
            escapes.append(f"\\U{target:08X}")
        literal = "".join(escapes)
        return "{" + str(self.code_point) + ', "' + literal + '"}'

# https://www.unicode.org/reports/tr44/#UnicodeData.txt

def canonical_decomposition_lines(lines: typing.Iterable[str]) -> typing.List[str]:
    """Return the UnicodeData.txt records that carry a canonical decomposition.

    Field 5 (zero-based) of each ``;``-separated record is the decomposition
    mapping. Records where that field is empty are dropped, and so are records
    where it starts with ``<``, which marks a tagged, non-canonical mapping.
    Blank lines are skipped instead of raising.

    Raises:
        IndexError: If a non-blank line has fewer than six fields.
    """
    selected = []
    for line in lines:
        if not line.strip():
            continue
        # Split once per record instead of once per filter pass.
        mapping = line.split(";")[5]
        if mapping.strip() and not mapping.startswith("<"):
            selected.append(line)
    return selected


def render_header(decompositions: typing.Sequence["Decomposition"]) -> str:
    """Return the text of the generated C++ header for ``decompositions``.

    Each entry contributes one row, produced by its ``to_cxx_class()`` method,
    to the ``kDecompositions`` array.
    """
    # Dedent the static template *before* substituting, so the generated rows
    # can never influence the common margin that dedent strips.
    template = textwrap.dedent(
        """\
        // SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
        // SPDX-License-Identifier: BSD-2-Clause

        // This file is generated. Do not touch it.

        #ifndef IDNA_UNICODE_DATA_H_
        #define IDNA_UNICODE_DATA_H_
        // clang-format off

        #include <array>
        #include <string_view>

        namespace idna::unicode {{

        struct Decomposition {{
            char32_t code_point{{}};
            std::string_view decomposes_to{{}};
            constexpr bool operator==(Decomposition const &) const = default;
        }};

        constexpr std::array<Decomposition, {count}> kDecompositions{{{{
         {rows}
        }}}};

        }} // namespace idna::unicode

        // clang-format on
        #endif
        """
    )
    # Joined outside of any f-string: a backslash inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    rows = ",\n ".join(d.to_cxx_class() for d in decompositions)
    return template.format(count=len(decompositions), rows=rows)


def main(argv: typing.List[str]) -> int:
    """Convert the UnicodeData.txt named in ``argv[1]`` to a header on stdout.

    Returns the process exit status: 1 after printing a usage message when the
    argument count is wrong, 0 otherwise.
    """
    if len(argv) != 2:
        print(
            f"Usage: {argv[0]} <UnicodeData.txt>",
            file=sys.stderr,
        )
        return 1

    # Explicit encoding so the result does not depend on the build machine's locale.
    with open(argv[1], encoding="utf-8") as table:
        lines = canonical_decomposition_lines(table)

    decompositions = [Decomposition.parse_from(line) for line in lines]

    # Write bytes directly so the output is UTF-8 regardless of stdout's encoding.
    sys.stdout.buffer.write(render_header(decompositions).encode("utf-8"))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))