srctree

Robin Linden parent e7181ac3 b4084f95
unicode: Port the unicode data processor to C++

Using Python here made things slower and more complex than just keeping it in C++ like the rest of the code base.
unicode/BUILD added: 95, removed: 93, total 2
@@ -1,10 +1,11 @@
load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
load("@rules_python//python:defs.bzl", "py_binary")
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
load("//bzl:copts.bzl", "HASTUR_COPTS")
 
# Build rule for the `unicode_data_processor` tool.
# NOTE(review): this hunk appears to interleave the removed py_binary lines
# (srcs = unicode_data_processor.py) with their cc_binary replacements
# (srcs = unicode_data_processor.cpp, built with HASTUR_COPTS and depending on
# //util:string) - confirm against the real BUILD file before editing.
py_binary(
cc_binary(
name = "unicode_data_processor",
srcs = ["unicode_data_processor.py"],
srcs = ["unicode_data_processor.cpp"],
copts = HASTUR_COPTS,
deps = ["//util:string"],
)
 
genrule(
 
filename was Deleted added: 95, removed: 93, total 2
@@ -0,0 +1,87 @@
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
#include "util/string.h"
 
#include <charconv>
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <system_error>
 
// https://www.unicode.org/reports/tr44/#UnicodeData.txt
int main(int argc, char **argv) {
if (argc != 2) {
auto const *bin_name = argv[0] != nullptr ? argv[0] : "<bin>";
std::cerr << "Usage: " << bin_name << " <UnicodeData.txt>\n";
return 1;
}
 
std::ifstream table{argv[1]};
if (!table) {
std::cerr << "Unable to open " << argv[1] << " for reading\n";
return 1;
}
 
std::cout << R"(// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
// This file is generated. Do not touch it.
 
#ifndef UNICODE_UNICODE_DATA_H_
#define UNICODE_UNICODE_DATA_H_
// clang-format off
 
#include <array>
#include <string_view>
 
namespace unicode::generated {
 
struct Decomposition {
char32_t code_point{};
std::string_view decomposes_to{};
constexpr bool operator==(Decomposition const &) const = default;
};
 
constexpr auto kDecompositions = std::to_array<Decomposition>({
)";
 
for (std::string line{}; std::getline(table, line);) {
// Filter out lines not containing decomposition info.
auto fields = util::split(line, ";");
if (fields.size() < 6) {
continue;
}
 
// Filter out non-canonical decompositions.
auto decomposition_field = fields[5];
if (decomposition_field.empty() || decomposition_field[0] == '<') {
continue;
}
 
std::cout << " {0x" << fields[0] << ", \"";
 
auto decompositions = util::split(decomposition_field, " ");
for (auto const &decomp : decompositions) {
std::uint32_t code_point{};
auto res = std::from_chars(decomp.data(), decomp.data() + decomp.size(), code_point, 16);
if (res.ec != std::errc{} || res.ptr != decomp.data() + decomp.size()) {
std::cerr << "Invalid code point: " << decomp << '\n';
return 1;
}
 
std::cout << "\\U" << std::hex << std::setw(8) << std::setfill('0') << code_point;
}
 
std::cout << "\"},\n";
}
 
std::cout << "});\n\n"
<< "} // namespace unicode::generated\n\n"
<< "// clang-format on\n"
<< "#endif\n";
}
 
/dev/null added: 95, removed: 93, total 2
@@ -1,86 +0,0 @@
#!/usr/bin/env python3
 
# SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
#
# SPDX-License-Identifier: BSD-2-Clause
 
import dataclasses
import operator
import sys
import textwrap
import typing
 
 
@dataclasses.dataclass
class Decomposition:
    """A code point together with the code points it decomposes to."""

    code_point: int
    decomposes_to: typing.List[int]

    @staticmethod
    def parse_from(line: str) -> "Decomposition":
        """Build a Decomposition from one ';'-separated line.

        Field 0 is read as the hexadecimal code point and field 5 as a
        space-separated list of hexadecimal code points.
        """
        fields = line.split(";")
        raw_code_point = fields[0]
        raw_mapping = fields[5]

        parsed_code_point = int(raw_code_point, base=16)
        parsed_mapping = []
        for token in raw_mapping.split(" "):
            parsed_mapping.append(int(token, base=16))

        return Decomposition(parsed_code_point, parsed_mapping)

    def to_cxx_class(self) -> str:
        """Render this entry as a C++ braced initializer.

        The code point is written in decimal and the decomposition as a string
        literal made of 8-digit uppercase \\U escapes.
        """
        escapes = ["\\U{:08X}".format(c) for c in self.decomposes_to]
        return "{" + str(self.code_point) + ', "' + "".join(escapes) + '"}'
 
 
# https://www.unicode.org/reports/tr44/#UnicodeData.txt
if __name__ == "__main__":
if len(sys.argv) != 2:
print(
f"Usage: {sys.argv[0]} <UnicodeData.txt>",
file=sys.stderr,
)
sys.exit(1)
 
with open(sys.argv[1]) as table:
lines = table.readlines()
 
# Filter out lines not containing decomposition info.
lines = [line for line in lines if line.split(";")[5].strip()]
 
# Filter out non-canonical decompositions.
lines = [line for line in lines if not line.split(";")[5].startswith("<")]
 
decompositions = [Decomposition.parse_from(line) for line in lines]
 
sys.stdout.buffer.write(
textwrap.dedent(
f"""\
// SPDX-FileCopyrightText: 2024 Robin Lindén <dev@robinlinden.eu>
//
// SPDX-License-Identifier: BSD-2-Clause
 
// This file is generated. Do not touch it.
 
#ifndef UNICODE_UNICODE_DATA_H_
#define UNICODE_UNICODE_DATA_H_
// clang-format off
 
#include <array>
#include <string_view>
 
namespace unicode::generated {{
 
struct Decomposition {{
char32_t code_point{{}};
std::string_view decomposes_to{{}};
constexpr bool operator==(Decomposition const &) const = default;
}};
 
constexpr std::array<Decomposition, {len(decompositions)}> kDecompositions{{{{
{",\n ".join(d.to_cxx_class() for d in decompositions)}
}}}};
 
}} // namespace unicode::generated
 
// clang-format on
#endif
"""
).encode()
)