srctree

Robin Linden parent 7375819d 48d24750
util/unicode: Add a helper for mapping UTF-8 text to code points

inlinesplit
util/unicode.h added: 89, removed: 2, total 87
@@ -8,6 +8,7 @@
#include <cstdint>
#include <optional>
#include <string>
#include <utility>
 
namespace util {
 
@@ -178,6 +179,67 @@ constexpr std::optional<std::size_t> utf8_length(std::string_view input) {
return len;
}
 
// TODO(robinlinden): Only allow use w/ valid UTF-8.
class CodePointView {
class CodePointIterator;
 
public:
constexpr explicit CodePointView(std::string_view utf8_data) : view_{std::move(utf8_data)} {}
 
constexpr CodePointIterator begin() const { return CodePointIterator{view_.begin()}; }
constexpr CodePointIterator end() const { return CodePointIterator{view_.end()}; }
 
private:
std::string_view view_;
 
class CodePointIterator {
public:
constexpr explicit CodePointIterator(std::string_view::const_iterator it) : it_{std::move(it)} {}
 
constexpr CodePointIterator &operator++() {
it_ += current_code_point_length();
return *this;
}
 
constexpr CodePointIterator operator++(int) {
auto copy = *this;
++*this;
return copy;
}
 
constexpr std::uint32_t operator*() const {
return utf8_to_utf32(std::string_view{it_, it_ + current_code_point_length()});
}
 
[[nodiscard]] constexpr bool operator==(CodePointIterator const &) const = default;
 
private:
static constexpr auto kTwoByteMask = 0b1100'0000;
static constexpr auto kThreeByteMask = 0b1110'0000;
static constexpr auto kFourByteMask = 0b1111'0000;
 
std::string_view::const_iterator it_;
 
constexpr int current_code_point_length() const {
auto const current = *it_;
 
if ((current & kFourByteMask) == kFourByteMask) {
return 4;
}
 
if ((current & kThreeByteMask) == kThreeByteMask) {
return 3;
}
 
if ((current & kTwoByteMask) == kTwoByteMask) {
return 2;
}
 
return 1;
}
};
};
 
} // namespace util
 
#endif
 
util/unicode_test.cpp added: 89, removed: 2, total 87
@@ -102,5 +102,30 @@ int main() {
expect_eq(utf8_length(invalid), std::nullopt);
});
 
etest::test("CodePointView", [] {
    using CodePoints = std::vector<std::uint32_t>;

    // Runs the whole input through CodePointView and gathers what it yields.
    auto decode = [](std::string_view text) {
        CodePoints decoded{};
        for (auto const code_point : CodePointView{text}) {
            decoded.push_back(code_point);
        }
        return decoded;
    };

    // ROBOT FACE, three times in a row.
    expect_eq(decode("🤖🤖🤖"sv), CodePoints{0x1f916, 0x1f916, 0x1f916});

    // GOTHIC LETTER HWAIR, spelled out as its four UTF-8 bytes.
    expect_eq(decode("\xf0\x90\x8d\x88"sv), CodePoints{0x10348});

    // Plain ASCII, one byte per code point.
    expect_eq(decode("abcd"sv), CodePoints{'a', 'b', 'c', 'd'});

    // REGISTERED SIGN, a two-byte sequence.
    expect_eq(decode("\xc2\xae"sv), CodePoints{0xae});

    // BUGINESE END OF SECTION, a three-byte sequence.
    expect_eq(decode("\xe1\xa8\x9f"sv), CodePoints{0x1a1f});
});
 
return etest::run_all_tests();
}