srctree

Ryan Liptak parent b344ff01 4ee1309a
std.unicode: Refactor and add WTF-16/WTF-8 functions

Renamed functions for consistent Le capitalization and conventions:

utf16leToUtf8Alloc -> utf16LeToUtf8Alloc
utf16leToUtf8AllocZ -> utf16LeToUtf8AllocZ
utf16leToUtf8 -> utf16LeToUtf8
utf8ToUtf16LeWithNull -> utf8ToUtf16LeAllocZ
fmtUtf16le -> fmtUtf16Le
New UTF related functions:
utf16LeToUtf8ArrayList
utf8ToUtf16LeArrayList
utf8ToUtf16LeAlloc
isSurrogateCodepoint
(the ArrayList functions exist mostly so that the Alloc and AllocZ variants can share an implementation)
New WTF related functions/structs:
wtf8Encode
wtf8Decode
wtf8ValidateSlice
Wtf8View
Wtf8Iterator
wtf16LeToWtf8ArrayList
wtf16LeToWtf8Alloc
wtf16LeToWtf8AllocZ
wtf16LeToWtf8
wtf8ToWtf16LeArrayList
wtf8ToWtf16LeAlloc
wtf8ToWtf16LeAllocZ
wtf8ToWtf16Le
wtf8ToUtf8Lossy
wtf8ToUtf8LossyAlloc
wtf8ToUtf8LossyAllocZ
Wtf16LeIterator

inline split

lib/std/unicode.zig added: 783, removed: 121, total 662

@@ -39,7 +39,16 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {

/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).

/// Errors: if c cannot be encoded in UTF-8.

/// Returns: the number of bytes written to out.

pub fn utf8Encode(c: u21, out: []u8) !u3 {

pub fn utf8Encode(c: u21, out: []u8) error{ Utf8CannotEncodeSurrogateHalf, CodepointTooLarge }!u3 {

return utf8EncodeImpl(c, out, .cannot_encode_surrogate_half);

}

/// Selects how the shared UTF-8/WTF-8 helper implementations treat surrogate codepoints.

const Surrogates = enum {

/// Surrogate codepoints are rejected; this is the option passed by the UTF-8 entry points.

cannot_encode_surrogate_half,

/// Surrogate codepoints are accepted; this is the option passed by the WTF-8 entry points.

can_encode_surrogate_half,

};

fn utf8EncodeImpl(c: u21, out: []u8, comptime surrogates: Surrogates) !u3 {

const length = try utf8CodepointSequenceLength(c);

assert(out.len >= length);

switch (length) {

@@ -53,7 +62,9 @@ pub fn utf8Encode(c: u21, out: []u8) !u3 {

out[1] = @as(u8, @intCast(0b10000000 | (c & 0b111111)));

3 => {

if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf;

if (surrogates == .cannot_encode_surrogate_half and isSurrogateCodepoint(c)) {

return error.Utf8CannotEncodeSurrogateHalf;

}

out[0] = @as(u8, @intCast(0b11100000 | (c >> 12)));

out[1] = @as(u8, @intCast(0b10000000 | ((c >> 6) & 0b111111)));

out[2] = @as(u8, @intCast(0b10000000 | (c & 0b111111)));

@@ -116,12 +127,22 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {

return value;

}

const Utf8Decode3Error = error{

Utf8ExpectedContinuation,

Utf8OverlongEncoding,

const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{

Utf8EncodesSurrogateHalf,

};

/// Decodes a three-byte UTF-8 sequence into a codepoint.
/// Decoding is delegated to `utf8Decode3AllowSurrogateHalf`; on top of the
/// errors reported there, a decoded value in the range U+D800 to U+DFFF is
/// rejected with `error.Utf8EncodesSurrogateHalf`.
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
    const codepoint = try utf8Decode3AllowSurrogateHalf(bytes);
    return switch (codepoint) {
        0xd800...0xdfff => error.Utf8EncodesSurrogateHalf,
        else => codepoint,
    };
}

const Utf8Decode3AllowSurrogateHalfError = error{

Utf8ExpectedContinuation,

Utf8OverlongEncoding,

};

pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 {

assert(bytes.len == 3);

assert(bytes[0] & 0b11110000 == 0b11100000);

var value: u21 = bytes[0] & 0b00001111;

@@ -135,7 +156,6 @@ pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {

value |= bytes[2] & 0b00111111;

if (value < 0x800) return error.Utf8OverlongEncoding;

if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;

return value;

}

@@ -213,6 +233,10 @@ pub fn utf8CountCodepoints(s: []const u8) !usize {

/// Returns true if the input consists entirely of UTF-8 codepoints.

/// Validation is delegated to `utf8ValidateSliceImpl` with

/// `.cannot_encode_surrogate_half`.

pub fn utf8ValidateSlice(input: []const u8) bool {

return utf8ValidateSliceImpl(input, .cannot_encode_surrogate_half);

}

fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {

var remaining = input;

const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;

@@ -240,9 +264,15 @@ pub fn utf8ValidateSlice(input: []const u8) bool {

const xx = 0xF1; // invalid: size 1

const as = 0xF0; // ASCII: size 1

const s1 = 0x02; // accept 0, size 2

const s2 = 0x13; // accept 1, size 3

const s2 = switch (surrogates) {

.cannot_encode_surrogate_half => 0x13, // accept 1, size 3

.can_encode_surrogate_half => 0x03, // accept 0, size 3

};

const s3 = 0x03; // accept 0, size 3

const s4 = 0x23; // accept 2, size 3

const s4 = switch (surrogates) {

.cannot_encode_surrogate_half => 0x23, // accept 2, size 3

.can_encode_surrogate_half => 0x03, // accept 0, size 3

};

const s5 = 0x34; // accept 3, size 4

const s6 = 0x04; // accept 0, size 4

const s7 = 0x44; // accept 4, size 4

@@ -770,94 +800,93 @@ fn testDecode(bytes: []const u8) !u21 {

return utf8Decode(bytes);

}

/// Appends the UTF-8 (or WTF-8) encoding of `utf16le` to `array_list`.
///
/// With `.cannot_encode_surrogate_half` the input is decoded with
/// `Utf16LeIterator` and any error it reports is propagated. With
/// `.can_encode_surrogate_half` the input is decoded with `Wtf16LeIterator`
/// and each codepoint is written with `wtf8Encode`.
///
/// Items already present in `array_list` are kept; the encoded bytes are
/// appended after them. Allocation failures are propagated.
fn utf16LeToUtf8ArrayListImpl(array_list: *std.ArrayList(u8), utf16le: []const u16, comptime surrogates: Surrogates) !void {
    // Optimistically guess that it will all be ASCII. The reservation is made
    // relative to the current length so that the `AssumeCapacity` appends in
    // the fast path below stay within capacity even when the list already
    // holds items.
    try array_list.ensureTotalCapacityPrecise(array_list.items.len + utf16le.len);

    var remaining = utf16le;
    if (builtin.zig_backend != .stage2_x86_64) {
        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
        const Chunk = @Vector(chunk_len, u16);

        // Fast path. Check for and encode ASCII characters at the start of the input.
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
            const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
            if (@reduce(.Or, chunk | mask != mask)) {
                // found a non ASCII code unit
                break;
            }
            const chunk_byte_len = chunk_len * 2;
            const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
            const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
            const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
            // We reserved enough space to encode every UTF-16 code unit
            // as ASCII, so if the entire string is ASCII then we are
            // guaranteed to have enough space allocated
            array_list.appendSliceAssumeCapacity(&ascii_bytes);
            remaining = remaining[chunk_len..];
        }
    }

    // Slow path: decode the rest one codepoint at a time, growing the list
    // by exactly the encoded length of each codepoint before writing it.
    var out_index: usize = array_list.items.len;
    switch (surrogates) {
        .cannot_encode_surrogate_half => {
            var it = Utf16LeIterator.init(remaining);
            while (try it.nextCodepoint()) |codepoint| {
                const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
                try array_list.resize(array_list.items.len + utf8_len);
                assert((utf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
                out_index += utf8_len;
            }
        },
        .can_encode_surrogate_half => {
            var it = Wtf16LeIterator.init(remaining);
            while (it.nextCodepoint()) |codepoint| {
                const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
                try array_list.resize(array_list.items.len + utf8_len);
                assert((wtf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
                out_index += utf8_len;
            }
        },
    }
}

/// Appends the UTF-8 encoding of `utf16le` to `array_list` by calling

/// `utf16LeToUtf8ArrayListImpl` with `.cannot_encode_surrogate_half`.

/// Errors from that call are returned unchanged.

pub fn utf16LeToUtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) !void {

return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .cannot_encode_surrogate_half);

}

/// Deprecated; renamed to utf16LeToUtf8Alloc

pub const utf16leToUtf8Alloc = utf16LeToUtf8Alloc;

/// Caller must free returned memory.

pub fn utf16leToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8 {

pub fn utf16LeToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8 {

// optimistically guess that it will all be ascii.

var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len);

errdefer result.deinit();

var remaining = utf16le;

if (builtin.zig_backend != .stage2_x86_64) {

const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;

const Chunk = @Vector(chunk_len, u16);

// Fast path. Check for and encode ASCII characters at the start of the input.

while (remaining.len >= chunk_len) {

const chunk: Chunk = remaining[0..chunk_len].*;

const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));

if (@reduce(.Or, chunk | mask != mask)) {

// found a non ASCII code unit

break;

}

const chunk_byte_len = chunk_len * 2;

const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;

const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);

const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];

// We allocated enough space to encode every UTF-16 code unit

// as ASCII, so if the entire string is ASCII then we are

// guaranteed to have enough space allocated

result.appendSliceAssumeCapacity(&ascii_bytes);

remaining = remaining[chunk_len..];

}

var out_index: usize = result.items.len;

var it = Utf16LeIterator.init(remaining);

while (try it.nextCodepoint()) |codepoint| {

const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;

try result.resize(result.items.len + utf8_len);

assert((utf8Encode(codepoint, result.items[out_index..]) catch unreachable) == utf8_len);

out_index += utf8_len;

}

try utf16LeToUtf8ArrayList(&result, utf16le);

return result.toOwnedSlice();

}

/// Deprecated; renamed to utf16LeToUtf8AllocZ

pub const utf16leToUtf8AllocZ = utf16LeToUtf8AllocZ;

/// Caller must free returned memory.

pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]u8 {

pub fn utf16LeToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]u8 {

// optimistically guess that it will all be ascii (and allocate space for the null terminator)

var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1);

errdefer result.deinit();

var remaining = utf16le;

if (builtin.zig_backend != .stage2_x86_64) {

const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;

const Chunk = @Vector(chunk_len, u16);

try utf16LeToUtf8ArrayList(&result, utf16le);

// Fast path. Check for and encode ASCII characters at the start of the input.

while (remaining.len >= chunk_len) {

const chunk: Chunk = remaining[0..chunk_len].*;

const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));

if (@reduce(.Or, chunk | mask != mask)) {

// found a non ASCII code unit

break;

}

const chunk_byte_len = chunk_len * 2;

const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;

const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);

const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];

// We allocated enough space to encode every UTF-16 code unit

// as ASCII, so if the entire string is ASCII then we are

// guaranteed to have enough space allocated

result.appendSliceAssumeCapacity(&ascii_bytes);

remaining = remaining[chunk_len..];

}

var out_index = result.items.len;

var it = Utf16LeIterator.init(remaining);

while (try it.nextCodepoint()) |codepoint| {

const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;

try result.resize(result.items.len + utf8_len);

assert((utf8Encode(codepoint, result.items[out_index..]) catch unreachable) == utf8_len);

out_index += utf8_len;

}

return result.toOwnedSliceSentinel(0);

}

/// Asserts that the output buffer is big enough.

/// Returns end byte index into utf8.

pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {

fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surrogates) !usize {

var end_index: usize = 0;

var remaining = utf16le;

@@ -883,30 +912,56 @@ pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {

}

var it = Utf16LeIterator.init(remaining);

while (try it.nextCodepoint()) |codepoint| {

end_index += try utf8Encode(codepoint, utf8[end_index..]);

switch (surrogates) {

.cannot_encode_surrogate_half => {

var it = Utf16LeIterator.init(remaining);

while (try it.nextCodepoint()) |codepoint| {

end_index += utf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {

// The maximum possible codepoint encoded by UTF-16 is U+10FFFF,

// which is within the valid codepoint range.

error.CodepointTooLarge => unreachable,

else => |e| return e,

};

}

.can_encode_surrogate_half => {

var it = Wtf16LeIterator.init(remaining);

while (it.nextCodepoint()) |codepoint| {

end_index += wtf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {

// The maximum possible codepoint encoded by UTF-16 is U+10FFFF,

// which is within the valid codepoint range.

error.CodepointTooLarge => unreachable,

};

}

return end_index;

}

test "utf16leToUtf8" {

/// Deprecated; renamed to utf16LeToUtf8

pub const utf16leToUtf8 = utf16LeToUtf8;

/// Encodes `utf16le` as UTF-8 into `utf8` by calling `utf16LeToUtf8Impl` with

/// `.cannot_encode_surrogate_half`, and returns the end index it reports.

/// NOTE(review): presumably `utf8` must be large enough for the whole output — confirm against `utf16LeToUtf8Impl`.

pub fn utf16LeToUtf8(utf8: []u8, utf16le: []const u16) !usize {

return utf16LeToUtf8Impl(utf8, utf16le, .cannot_encode_surrogate_half);

}

test utf16LeToUtf8 {

var utf16le: [2]u16 = undefined;

const utf16le_as_bytes = mem.sliceAsBytes(utf16le[0..]);

{

mem.writeInt(u16, utf16le_as_bytes[0..2], 'A', .little);

mem.writeInt(u16, utf16le_as_bytes[2..4], 'a', .little);

const utf8 = try utf16leToUtf8Alloc(std.testing.allocator, &utf16le);

defer std.testing.allocator.free(utf8);

const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);

defer testing.allocator.free(utf8);

try testing.expect(mem.eql(u8, utf8, "Aa"));

}

{

mem.writeInt(u16, utf16le_as_bytes[0..2], 0x80, .little);

mem.writeInt(u16, utf16le_as_bytes[2..4], 0xffff, .little);

const utf8 = try utf16leToUtf8Alloc(std.testing.allocator, &utf16le);

defer std.testing.allocator.free(utf8);

const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);

defer testing.allocator.free(utf8);

try testing.expect(mem.eql(u8, utf8, "\xc2\x80" ++ "\xef\xbf\xbf"));

}

@@ -914,8 +969,8 @@ test "utf16leToUtf8" {

// the values just outside the surrogate half range

mem.writeInt(u16, utf16le_as_bytes[0..2], 0xd7ff, .little);

mem.writeInt(u16, utf16le_as_bytes[2..4], 0xe000, .little);

const utf8 = try utf16leToUtf8Alloc(std.testing.allocator, &utf16le);

defer std.testing.allocator.free(utf8);

const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);

defer testing.allocator.free(utf8);

try testing.expect(mem.eql(u8, utf8, "\xed\x9f\xbf" ++ "\xee\x80\x80"));

}

@@ -923,8 +978,8 @@ test "utf16leToUtf8" {

// smallest surrogate pair

mem.writeInt(u16, utf16le_as_bytes[0..2], 0xd800, .little);

mem.writeInt(u16, utf16le_as_bytes[2..4], 0xdc00, .little);

const utf8 = try utf16leToUtf8Alloc(std.testing.allocator, &utf16le);

defer std.testing.allocator.free(utf8);

const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);

defer testing.allocator.free(utf8);

try testing.expect(mem.eql(u8, utf8, "\xf0\x90\x80\x80"));

}

@@ -932,31 +987,30 @@ test "utf16leToUtf8" {

// largest surrogate pair

mem.writeInt(u16, utf16le_as_bytes[0..2], 0xdbff, .little);

mem.writeInt(u16, utf16le_as_bytes[2..4], 0xdfff, .little);

const utf8 = try utf16leToUtf8Alloc(std.testing.allocator, &utf16le);

defer std.testing.allocator.free(utf8);

const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);

defer testing.allocator.free(utf8);

try testing.expect(mem.eql(u8, utf8, "\xf4\x8f\xbf\xbf"));

}

{

mem.writeInt(u16, utf16le_as_bytes[0..2], 0xdbff, .little);

mem.writeInt(u16, utf16le_as_bytes[2..4], 0xdc00, .little);

const utf8 = try utf16leToUtf8Alloc(std.testing.allocator, &utf16le);

defer std.testing.allocator.free(utf8);

const utf8 = try utf16LeToUtf8Alloc(testing.allocator, &utf16le);

defer testing.allocator.free(utf8);

try testing.expect(mem.eql(u8, utf8, "\xf4\x8f\xb0\x80"));

}

{

mem.writeInt(u16, utf16le_as_bytes[0..2], 0xdcdc, .little);

mem.writeInt(u16, utf16le_as_bytes[2..4], 0xdcdc, .little);

const result = utf16leToUtf8Alloc(std.testing.allocator, &utf16le);

try std.testing.expectError(error.UnexpectedSecondSurrogateHalf, result);

const result = utf16LeToUtf8Alloc(testing.allocator, &utf16le);

try testing.expectError(error.UnexpectedSecondSurrogateHalf, result);

}

pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u16 {

fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {

// optimistically guess that it will not require surrogate pairs

var result = try std.ArrayList(u16).initCapacity(allocator, utf8.len + 1);

errdefer result.deinit();

try array_list.ensureTotalCapacityPrecise(utf8.len);

var remaining = utf8;

// Need support for std.simd.interlace

@@ -974,26 +1028,54 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1

}

const zeroes: Chunk = @splat(0);

const utf16_chunk: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });

result.appendSliceAssumeCapacity(std.mem.bytesAsSlice(u16, &utf16_chunk));

array_list.appendSliceAssumeCapacity(std.mem.bytesAsSlice(u16, &utf16_chunk));

remaining = remaining[chunk_len..];

}

const view = try Utf8View.init(remaining);

const view = switch (surrogates) {

.cannot_encode_surrogate_half => try Utf8View.init(remaining),

.can_encode_surrogate_half => try Wtf8View.init(remaining),

};

var it = view.iterator();

while (it.nextCodepoint()) |codepoint| {

if (codepoint < 0x10000) {

const short = @as(u16, @intCast(codepoint));

try result.append(mem.nativeToLittle(u16, short));

try array_list.append(mem.nativeToLittle(u16, short));

} else {

const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;

const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;

var out: [2]u16 = undefined;

out[0] = mem.nativeToLittle(u16, high);

out[1] = mem.nativeToLittle(u16, low);

try result.appendSlice(out[0..]);

try array_list.appendSlice(out[0..]);

}

/// Appends the UTF-16LE encoding of `utf8` to `array_list` by calling

/// `utf8ToUtf16LeArrayListImpl` with `.cannot_encode_surrogate_half`.

/// Errors from that call are returned unchanged.

pub fn utf8ToUtf16LeArrayList(array_list: *std.ArrayList(u16), utf8: []const u8) !void {

return utf8ToUtf16LeArrayListImpl(array_list, utf8, .cannot_encode_surrogate_half);

}

/// Converts `utf8` to a newly allocated UTF-16LE slice.
/// Caller must free returned memory.
pub fn utf8ToUtf16LeAlloc(allocator: mem.Allocator, utf8: []const u8) ![]u16 {
    // Start with one UTF-16 code unit of capacity per input byte.
    var utf16_list = try std.ArrayList(u16).initCapacity(allocator, utf8.len);
    errdefer utf16_list.deinit();
    try utf8ToUtf16LeArrayListImpl(&utf16_list, utf8, .cannot_encode_surrogate_half);
    return utf16_list.toOwnedSlice();
}

/// Deprecated; renamed to utf8ToUtf16LeAllocZ

pub const utf8ToUtf16LeWithNull = utf8ToUtf16LeAllocZ;

/// Converts `utf8` to a newly allocated, 0-terminated UTF-16LE slice.
/// Caller must free returned memory.
pub fn utf8ToUtf16LeAllocZ(allocator: mem.Allocator, utf8: []const u8) ![:0]u16 {
    // Start with one UTF-16 code unit of capacity per input byte, plus one
    // for the terminator.
    var utf16_list = try std.ArrayList(u16).initCapacity(allocator, utf8.len + 1);
    errdefer utf16_list.deinit();
    try utf8ToUtf16LeArrayListImpl(&utf16_list, utf8, .cannot_encode_surrogate_half);
    return utf16_list.toOwnedSliceSentinel(0);
}

@@ -1001,6 +1083,10 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1

/// Returns index of next character. If exact fit, returned index equals output slice length.

/// Assumes there is enough space for the output.

/// Delegates to `utf8ToUtf16LeImpl` with `.cannot_encode_surrogate_half`.

pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {

return utf8ToUtf16LeImpl(utf16le, utf8, .cannot_encode_surrogate_half);

}

pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates: Surrogates) !usize {

var dest_i: usize = 0;

var remaining = utf8;

@@ -1029,7 +1115,10 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {

while (src_i < remaining.len) {

const n = utf8ByteSequenceLength(remaining[src_i]) catch return error.InvalidUtf8;

const next_src_i = src_i + n;

const codepoint = utf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8;

const codepoint = switch (surrogates) {

.cannot_encode_surrogate_half => utf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8,

.can_encode_surrogate_half => wtf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8,

};

if (codepoint < 0x10000) {

const short = @as(u16, @intCast(codepoint));

utf16le[dest_i] = mem.nativeToLittle(u16, short);

@@ -1064,21 +1153,59 @@ test "utf8ToUtf16Le" {

}

test "utf8ToUtf16LeWithNull" {

test utf8ToUtf16LeArrayList {

{

const utf16 = try utf8ToUtf16LeWithNull(testing.allocator, "𐐷");

var list = std.ArrayList(u16).init(testing.allocator);

defer list.deinit();

try utf8ToUtf16LeArrayList(&list, "𐐷");

try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(list.items));

}

{

var list = std.ArrayList(u16).init(testing.allocator);

defer list.deinit();

try utf8ToUtf16LeArrayList(&list, "\u{10FFFF}");

try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(list.items));

}

{

var list = std.ArrayList(u16).init(testing.allocator);

defer list.deinit();

const result = utf8ToUtf16LeArrayList(&list, "\xf4\x90\x80\x80");

try testing.expectError(error.InvalidUtf8, result);

}

test utf8ToUtf16LeAlloc {

{

const utf16 = try utf8ToUtf16LeAlloc(testing.allocator, "𐐷");

defer testing.allocator.free(utf16);

try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16[0..]));

}

{

const utf16 = try utf8ToUtf16LeAlloc(testing.allocator, "\u{10FFFF}");

defer testing.allocator.free(utf16);

try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16[0..]));

}

{

const result = utf8ToUtf16LeAlloc(testing.allocator, "\xf4\x90\x80\x80");

try testing.expectError(error.InvalidUtf8, result);

}

test utf8ToUtf16LeAllocZ {

{

const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "𐐷");

defer testing.allocator.free(utf16);

try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16[0..]));

try testing.expect(utf16[2] == 0);

}

{

const utf16 = try utf8ToUtf16LeWithNull(testing.allocator, "\u{10FFFF}");

const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "\u{10FFFF}");

defer testing.allocator.free(utf16);

try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16[0..]));

try testing.expect(utf16[2] == 0);

}

{

const result = utf8ToUtf16LeWithNull(testing.allocator, "\xf4\x90\x80\x80");

const result = utf8ToUtf16LeAllocZ(testing.allocator, "\xf4\x90\x80\x80");

try testing.expectError(error.InvalidUtf8, result);

}

@@ -1127,8 +1254,9 @@ test "calculate utf16 string length of given utf8 string in u16" {

try comptime testCalcUtf16LeLen();

}

/// Print the given `utf16le` string

fn formatUtf16le(

/// Print the given `utf16le` string, encoded as UTF-8 bytes.

/// Unpaired surrogates are replaced by the replacement character (U+FFFD).

fn formatUtf16Le(

utf16le: []const u16,

comptime fmt: []const u8,

options: std.fmt.FormatOptions,

@@ -1150,22 +1278,25 @@ fn formatUtf16le(

try writer.writeAll(buf[0..u8len]);

}

/// Deprecated; renamed to fmtUtf16Le

pub const fmtUtf16le = fmtUtf16Le;

/// Return a Formatter for a Utf16le string

pub fn fmtUtf16le(utf16le: []const u16) std.fmt.Formatter(formatUtf16le) {

pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Formatter(formatUtf16Le) {

return .{ .data = utf16le };

}

test "fmtUtf16le" {

const expectFmt = std.testing.expectFmt;

try expectFmt("", "{}", .{fmtUtf16le(utf8ToUtf16LeStringLiteral(""))});

try expectFmt("foo", "{}", .{fmtUtf16le(utf8ToUtf16LeStringLiteral("foo"))});

try expectFmt("𐐷", "{}", .{fmtUtf16le(utf8ToUtf16LeStringLiteral("𐐷"))});

try expectFmt("퟿", "{}", .{fmtUtf16le(&[_]u16{std.mem.readInt(u16, "\xff\xd7", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16le(&[_]u16{std.mem.readInt(u16, "\x00\xd8", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16le(&[_]u16{std.mem.readInt(u16, "\xff\xdb", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16le(&[_]u16{std.mem.readInt(u16, "\x00\xdc", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16le(&[_]u16{std.mem.readInt(u16, "\xff\xdf", native_endian)})});

try expectFmt("", "{}", .{fmtUtf16le(&[_]u16{std.mem.readInt(u16, "\x00\xe0", native_endian)})});

test "fmtUtf16Le" {

const expectFmt = testing.expectFmt;

try expectFmt("", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});

try expectFmt("foo", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});

try expectFmt("𐐷", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("𐐷"))});

try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xd7", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xd8", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdb", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xdc", native_endian)})});

try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdf", native_endian)})});

try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xe0", native_endian)})});

}

test "utf8ToUtf16LeStringLiteral" {

@@ -1248,3 +1379,534 @@ test "utf8 valid codepoint" {

try testUtf8ValidCodepoint();

try comptime testUtf8ValidCodepoint();

}

/// Returns true if the codepoint is a surrogate (U+D800 to U+DFFF).
pub fn isSurrogateCodepoint(c: u21) bool {
    return 0xD800 <= c and c <= 0xDFFF;
}

/// Encodes the given codepoint into a WTF-8 byte sequence.

/// c: the codepoint.

/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).

/// Errors: `error.CodepointTooLarge` is the only error in this function's error set.

/// Returns: the number of bytes written to out.

/// Encoding is delegated to `utf8EncodeImpl` with `.can_encode_surrogate_half`.

pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {

return utf8EncodeImpl(c, out, .can_encode_surrogate_half);

}

const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error;

/// Decodes a single WTF-8 encoded codepoint from `bytes`.
/// `bytes.len` must be between 1 and 4; any other length is `unreachable`.
/// Three-byte sequences are decoded with `utf8Decode3AllowSurrogateHalf`.
pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
    switch (bytes.len) {
        1 => return @as(u21, bytes[0]),
        2 => return utf8Decode2(bytes),
        3 => return utf8Decode3AllowSurrogateHalf(bytes),
        4 => return utf8Decode4(bytes),
        else => unreachable,
    }
}

/// Returns true if the input consists entirely of WTF-8 codepoints

/// (all the same restrictions as UTF-8, but allows surrogate codepoints

/// U+D800 to U+DFFF).

/// Does not check for well-formed WTF-8, meaning that this function

/// does not check that all surrogate halves are unpaired.

/// Validation is delegated to `utf8ValidateSliceImpl` with `.can_encode_surrogate_half`.

pub fn wtf8ValidateSlice(input: []const u8) bool {

return utf8ValidateSliceImpl(input, .can_encode_surrogate_half);

}

test "validate WTF-8 slice" {

try testValidateWtf8Slice();

try comptime testValidateWtf8Slice();

// We skip a variable (based on recommended vector size) chunks of

// ASCII characters. Let's make sure we're chunking correctly.

const str = [_]u8{'a'} ** 550 ++ "\xc0";

for (0..str.len - 3) |i| {

try testing.expect(!wtf8ValidateSlice(str[i..]));

}

fn testValidateWtf8Slice() !void {

// These are valid/invalid under both UTF-8 and WTF-8 rules.

try testing.expect(wtf8ValidateSlice("abc"));

try testing.expect(wtf8ValidateSlice("abc\xdf\xbf"));

try testing.expect(wtf8ValidateSlice(""));

try testing.expect(wtf8ValidateSlice("a"));

try testing.expect(wtf8ValidateSlice("abc"));

try testing.expect(wtf8ValidateSlice("Ж"));

try testing.expect(wtf8ValidateSlice("ЖЖ"));

try testing.expect(wtf8ValidateSlice("брэд-ЛГТМ"));

try testing.expect(wtf8ValidateSlice("☺☻☹"));

try testing.expect(wtf8ValidateSlice("a\u{fffdb}"));

try testing.expect(wtf8ValidateSlice("\xf4\x8f\xbf\xbf"));

try testing.expect(wtf8ValidateSlice("abc\xdf\xbf"));

try testing.expect(!wtf8ValidateSlice("abc\xc0"));

try testing.expect(!wtf8ValidateSlice("abc\xc0abc"));

try testing.expect(!wtf8ValidateSlice("aa\xe2"));

try testing.expect(!wtf8ValidateSlice("\x42\xfa"));

try testing.expect(!wtf8ValidateSlice("\x42\xfa\x43"));

try testing.expect(!wtf8ValidateSlice("abc\xc0"));

try testing.expect(!wtf8ValidateSlice("abc\xc0abc"));

try testing.expect(!wtf8ValidateSlice("\xf4\x90\x80\x80"));

try testing.expect(!wtf8ValidateSlice("\xf7\xbf\xbf\xbf"));

try testing.expect(!wtf8ValidateSlice("\xfb\xbf\xbf\xbf\xbf"));

try testing.expect(!wtf8ValidateSlice("\xc0\x80"));

// But surrogate codepoints are only valid in WTF-8.

try testing.expect(wtf8ValidateSlice("\xed\xa0\x80"));

try testing.expect(wtf8ValidateSlice("\xed\xbf\xbf"));

}

/// Wtf8View iterates the code points of a WTF-8 encoded string,
/// including surrogate halves.
///
/// ```
/// var wtf8 = (try std.unicode.Wtf8View.init("hi there")).iterator();
/// while (wtf8.nextCodepointSlice()) |codepoint| {
///   // note: codepoint could be a surrogate half which is invalid
///   // UTF-8, avoid printing or otherwise sending/emitting this directly
/// }
/// ```
pub const Wtf8View = struct {
    bytes: []const u8,

    /// Validates `s` with `wtf8ValidateSlice` and wraps it in a view.
    /// Returns `error.InvalidWtf8` if validation fails.
    pub fn init(s: []const u8) error{InvalidWtf8}!Wtf8View {
        if (!wtf8ValidateSlice(s)) {
            return error.InvalidWtf8;
        }

        return initUnchecked(s);
    }

    /// Wraps `s` in a view without validating it.
    pub fn initUnchecked(s: []const u8) Wtf8View {
        return Wtf8View{ .bytes = s };
    }

    /// Comptime-only variant of `init`: a validation failure becomes a
    /// compile error instead of a returned error.
    pub inline fn initComptime(comptime s: []const u8) Wtf8View {
        return comptime if (init(s)) |r| r else |err| switch (err) {
            error.InvalidWtf8 => {
                @compileError("invalid utf8 detected in wtf8 string");
            },
        };
    }

    /// Returns an iterator positioned at the start of the view's bytes.
    pub fn iterator(s: Wtf8View) Wtf8Iterator {
        return Wtf8Iterator{
            .bytes = s.bytes,
            .i = 0,
        };
    }
};

/// Asserts that `bytes` is valid WTF-8
pub const Wtf8Iterator = struct {
    bytes: []const u8,
    i: usize,

    /// Returns the bytes of the next codepoint and advances past them,
    /// or null once the end of `bytes` has been reached.
    pub fn nextCodepointSlice(it: *Wtf8Iterator) ?[]const u8 {
        if (it.i >= it.bytes.len) return null;

        const start = it.i;
        const seq_len = utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
        it.i = start + seq_len;
        return it.bytes[start..it.i];
    }

    /// Returns the next codepoint (decoded with `wtf8Decode`) and advances
    /// past it, or null once the end of `bytes` has been reached.
    pub fn nextCodepoint(it: *Wtf8Iterator) ?u21 {
        if (it.nextCodepointSlice()) |slice| {
            return wtf8Decode(slice) catch unreachable;
        }
        return null;
    }

    /// Look ahead at the next n codepoints without advancing the iterator.
    /// If fewer than n codepoints are available, then return the remainder of the string.
    pub fn peek(it: *Wtf8Iterator, n: usize) []const u8 {
        const saved_i = it.i;
        // Restore the position on every exit path so that peeking never
        // moves the iterator.
        defer it.i = saved_i;

        var taken: usize = 0;
        while (taken < n) : (taken += 1) {
            _ = it.nextCodepointSlice() orelse return it.bytes[saved_i..];
        }
        return it.bytes[saved_i..it.i];
    }
};

/// Appends the WTF-8 encoding of `utf16le` to `array_list` by calling

/// `utf16LeToUtf8ArrayListImpl` with `.can_encode_surrogate_half`.

/// Errors from that call are returned unchanged.

pub fn wtf16LeToWtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) !void {

return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .can_encode_surrogate_half);

}

/// Converts `wtf16le` to a newly allocated WTF-8 slice.
/// Caller must free returned memory.
pub fn wtf16LeToWtf8Alloc(allocator: mem.Allocator, wtf16le: []const u16) ![]u8 {
    // Start with one byte of capacity per code unit, i.e. assume all ASCII.
    var wtf8_list = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len);
    errdefer wtf8_list.deinit();
    try wtf16LeToWtf8ArrayList(&wtf8_list, wtf16le);
    return wtf8_list.toOwnedSlice();
}

/// Converts `wtf16le` to a newly allocated, 0-terminated WTF-8 slice.
/// Caller must free returned memory.
pub fn wtf16LeToWtf8AllocZ(allocator: mem.Allocator, wtf16le: []const u16) ![:0]u8 {
    // Start with one byte of capacity per code unit (assume all ASCII),
    // plus one for the null terminator.
    var wtf8_list = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len + 1);
    errdefer wtf8_list.deinit();
    try wtf16LeToWtf8ArrayList(&wtf8_list, wtf16le);
    return wtf8_list.toOwnedSliceSentinel(0);
}

/// Encodes `wtf16le` as WTF-8 into `wtf8` by calling `utf16LeToUtf8Impl` with

/// `.can_encode_surrogate_half`, and returns the end index it reports.

/// The empty `switch (err) {}` handles no error values, which is why this

/// function's return type has no error union.

/// NOTE(review): presumably `wtf8` must be large enough for the whole output — confirm against `utf16LeToUtf8Impl`.

pub fn wtf16LeToWtf8(wtf8: []u8, wtf16le: []const u16) usize {

return utf16LeToUtf8Impl(wtf8, wtf16le, .can_encode_surrogate_half) catch |err| switch (err) {};

}

/// Appends the WTF-16LE encoding of `wtf8` to `array_list` by calling

/// `utf8ToUtf16LeArrayListImpl` with `.can_encode_surrogate_half`.

/// Errors from that call are returned unchanged.

pub fn wtf8ToWtf16LeArrayList(array_list: *std.ArrayList(u16), wtf8: []const u8) !void {

return utf8ToUtf16LeArrayListImpl(array_list, wtf8, .can_encode_surrogate_half);

}

/// Converts `wtf8` to a newly allocated WTF-16LE slice.
/// Caller must free returned memory.
pub fn wtf8ToWtf16LeAlloc(allocator: mem.Allocator, wtf8: []const u8) ![]u16 {
    // Start with one 16-bit code unit of capacity per input byte.
    var wtf16_list = try std.ArrayList(u16).initCapacity(allocator, wtf8.len);
    errdefer wtf16_list.deinit();
    try utf8ToUtf16LeArrayListImpl(&wtf16_list, wtf8, .can_encode_surrogate_half);
    return wtf16_list.toOwnedSlice();
}

/// Converts `wtf8` to a newly allocated, 0-terminated WTF-16LE slice.
/// Caller must free returned memory.
pub fn wtf8ToWtf16LeAllocZ(allocator: mem.Allocator, wtf8: []const u8) ![:0]u16 {
    // Start with one 16-bit code unit of capacity per input byte, plus one
    // for the terminator.
    var wtf16_list = try std.ArrayList(u16).initCapacity(allocator, wtf8.len + 1);
    errdefer wtf16_list.deinit();
    try utf8ToUtf16LeArrayListImpl(&wtf16_list, wtf8, .can_encode_surrogate_half);
    return wtf16_list.toOwnedSliceSentinel(0);
}

/// Returns index of next character. If exact fit, returned index equals output slice length.

/// Assumes there is enough space for the output.

/// Delegates to `utf8ToUtf16LeImpl` with `.can_encode_surrogate_half`.

pub fn wtf8ToWtf16Le(wtf16le: []u16, wtf8: []const u8) !usize {

return utf8ToUtf16LeImpl(wtf16le, wtf8, .can_encode_surrogate_half);

}

/// Surrogate codepoints (U+D800 to U+DFFF) are replaced by the Unicode replacement
/// character (U+FFFD).
/// All surrogate codepoints and the replacement character are encoded as three
/// bytes, meaning the input and output slices will always be the same length.
/// In-place conversion is supported when `utf8` and `wtf8` refer to the same slice.
/// Note: If `wtf8` is entirely composed of well-formed UTF-8, then no conversion is necessary.
/// `utf8ValidateSlice` can be used to check if lossy conversion is worthwhile.
///
/// Asserts that `utf8.len >= wtf8.len`.
/// Returns `error.InvalidWtf8` if `wtf8` does not pass `Wtf8View.init` validation.
pub fn wtf8ToUtf8Lossy(utf8: []u8, wtf8: []const u8) error{InvalidWtf8}!void {
    assert(utf8.len >= wtf8.len);

    const in_place = utf8.ptr == wtf8.ptr;
    const replacement_char_bytes = comptime blk: {
        var buf: [3]u8 = undefined;
        assert((utf8Encode(replacement_character, &buf) catch unreachable) == 3);
        break :blk buf;
    };

    var dest_i: usize = 0;
    // Report invalid input under a WTF-8 specific error name, whatever name
    // the view's own validation uses.
    const view = Wtf8View.init(wtf8) catch return error.InvalidWtf8;
    var it = view.iterator();
    while (it.nextCodepointSlice()) |codepoint_slice| {
        // All surrogate codepoints are encoded as 3 bytes
        if (codepoint_slice.len == 3) {
            const codepoint = wtf8Decode(codepoint_slice) catch unreachable;
            if (isSurrogateCodepoint(codepoint)) {
                @memcpy(utf8[dest_i..][0..replacement_char_bytes.len], &replacement_char_bytes);
                dest_i += replacement_char_bytes.len;
                continue;
            }
        }
        // When converting in place the bytes are already where they belong.
        if (!in_place) {
            @memcpy(utf8[dest_i..][0..codepoint_slice.len], codepoint_slice);
        }
        dest_i += codepoint_slice.len;
    }
}

/// Allocates a buffer of `wtf8.len` bytes and fills it using `wtf8ToUtf8Lossy`.
/// If the conversion returns an error, the buffer is freed and the error is propagated.
/// Caller must free returned memory.
pub fn wtf8ToUtf8LossyAlloc(allocator: mem.Allocator, wtf8: []const u8) ![]u8 {
    const converted = try allocator.alloc(u8, wtf8.len);
    errdefer allocator.free(converted);

    try wtf8ToUtf8Lossy(converted, wtf8);

    return converted;
}

/// Allocates a 0-terminated buffer of `wtf8.len` bytes and fills it using `wtf8ToUtf8Lossy`.
/// If the conversion returns an error, the buffer is freed and the error is propagated.
/// Caller must free returned memory.
pub fn wtf8ToUtf8LossyAllocZ(allocator: mem.Allocator, wtf8: []const u8) ![:0]u8 {
    const converted = try allocator.allocSentinel(u8, wtf8.len, 0);
    errdefer allocator.free(converted);

    try wtf8ToUtf8Lossy(converted, wtf8);

    return converted;
}

test wtf8ToUtf8Lossy {
    var out: [32]u8 = undefined;

    // Input that is not valid WTF-8 is rejected.
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8Lossy(&out, "\xff"));

    const Case = struct {
        wtf8: []const u8,
        utf8: []const u8,
    };
    const cases = [_]Case{
        // ASCII
        .{ .wtf8 = "abcd", .utf8 = "abcd" },
        // high surrogate half
        .{ .wtf8 = "ab\xed\xa0\xbdcd", .utf8 = "ab\u{FFFD}cd" },
        // low surrogate half
        .{ .wtf8 = "ab\xed\xb2\xa9cd", .utf8 = "ab\u{FFFD}cd" },
        // If the WTF-8 is not well-formed, each surrogate half is converted into a separate
        // replacement character instead of being interpreted as a surrogate pair.
        .{ .wtf8 = "ab\xed\xa0\xbd\xed\xb2\xa9cd", .utf8 = "ab\u{FFFD}\u{FFFD}cd" },
    };
    for (cases) |case| {
        try wtf8ToUtf8Lossy(&out, case.wtf8);
        try testing.expectEqualStrings(case.utf8, out[0..case.wtf8.len]);
    }

    // in place
    const low_surrogate_half = "ab\xed\xb2\xa9cd";
    const in_place = out[0..low_surrogate_half.len];
    @memcpy(in_place, low_surrogate_half);
    try wtf8ToUtf8Lossy(in_place, in_place);
    try testing.expectEqualStrings("ab\u{FFFD}cd", in_place);
}

test wtf8ToUtf8LossyAlloc {
    // Input that is not valid WTF-8 is rejected (and must not leak the buffer).
    const invalid_utf8 = "\xff";
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8LossyAlloc(testing.allocator, invalid_utf8));

    {
        const ascii = "abcd";
        const utf8 = try wtf8ToUtf8LossyAlloc(testing.allocator, ascii);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings("abcd", utf8);
    }

    {
        const surrogate_half = "ab\xed\xa0\xbdcd";
        const utf8 = try wtf8ToUtf8LossyAlloc(testing.allocator, surrogate_half);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings("ab\u{FFFD}cd", utf8);
    }

    {
        // If the WTF-8 is not well-formed, each surrogate half is converted into a separate
        // replacement character instead of being interpreted as a surrogate pair.
        const encoded_surrogate_pair = "ab\xed\xa0\xbd\xed\xb2\xa9cd";
        const utf8 = try wtf8ToUtf8LossyAlloc(testing.allocator, encoded_surrogate_pair);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings("ab\u{FFFD}\u{FFFD}cd", utf8);
    }
}

test wtf8ToUtf8LossyAllocZ {
    // Input that is not valid WTF-8 is rejected (and must not leak the buffer).
    const invalid_utf8 = "\xff";
    try testing.expectError(error.InvalidWtf8, wtf8ToUtf8LossyAllocZ(testing.allocator, invalid_utf8));

    {
        const ascii = "abcd";
        const utf8 = try wtf8ToUtf8LossyAllocZ(testing.allocator, ascii);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings("abcd", utf8);
    }

    {
        const surrogate_half = "ab\xed\xa0\xbdcd";
        const utf8 = try wtf8ToUtf8LossyAllocZ(testing.allocator, surrogate_half);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings("ab\u{FFFD}cd", utf8);
    }

    {
        // If the WTF-8 is not well-formed, each surrogate half is converted into a separate
        // replacement character instead of being interpreted as a surrogate pair.
        const encoded_surrogate_pair = "ab\xed\xa0\xbd\xed\xb2\xa9cd";
        const utf8 = try wtf8ToUtf8LossyAllocZ(testing.allocator, encoded_surrogate_pair);
        defer testing.allocator.free(utf8);
        try testing.expectEqualStrings("ab\u{FFFD}\u{FFFD}cd", utf8);
    }
}

/// Iterates over the codepoints of a `u16` slice, reading every code unit as
/// little-endian. Unpaired surrogates are yielded as their own codepoints.
pub const Wtf16LeIterator = struct {
    /// The input code units viewed as raw bytes.
    bytes: []const u8,
    /// Byte offset of the next unread code unit; advanced in steps of 2.
    i: usize,

    /// Creates an iterator positioned at the start of `s`.
    pub fn init(s: []const u16) Wtf16LeIterator {
        return Wtf16LeIterator{
            .bytes = std.mem.sliceAsBytes(s),
            .i = 0,
        };
    }

    /// If the next codepoint is encoded by a surrogate pair, returns the
    /// codepoint that the surrogate pair represents.
    /// If the next codepoint is an unpaired surrogate, returns the codepoint
    /// of the unpaired surrogate.
    /// Returns `null` once the whole input has been consumed.
    pub fn nextCodepoint(it: *Wtf16LeIterator) ?u21 {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;
        var code_units: [2]u16 = undefined;
        code_units[0] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        it.i += 2;
        surrogate_pair: {
            if (utf16IsHighSurrogate(code_units[0])) {
                // A high surrogate at the very end of the input has nothing to pair with.
                if (it.i >= it.bytes.len) break :surrogate_pair;
                code_units[1] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
                // If the two units do not decode as a pair, the second one is left
                // unconsumed (`it.i` is not advanced) and is read again by the next call.
                const codepoint = utf16DecodeSurrogatePair(&code_units) catch break :surrogate_pair;
                it.i += 2;
                return codepoint;
            }
        }
        // Either not a high surrogate, or a high surrogate that turned out to be unpaired.
        return code_units[0];
    }
};

test "non-well-formed WTF-8 does not roundtrip" {
    // Two separate 3-byte sequences, one per surrogate half (U+D83D, then U+DCA9).
    // Well-formed WTF-8 would encode the pair as U+1F4A9 instead: \xF0\x9F\x92\xA9.
    const input = "\xed\xa0\xbd\xed\xb2\xa9";

    var units_storage: [2]u16 = undefined;
    const units_len = try wtf8ToWtf16Le(&units_storage, input);
    const units = units_storage[0..units_len];
    const expected_units = [_]u16{
        mem.nativeToLittle(u16, 0xD83D), // high surrogate
        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
    };
    try testing.expectEqualSlices(u16, &expected_units, units);

    var bytes_storage: [4]u8 = undefined;
    const bytes_len = wtf16LeToWtf8(&bytes_storage, units);
    const bytes = bytes_storage[0..bytes_len];

    // Going through WTF-16 and back produces well-formed WTF-8,
    // which is not the same byte sequence as the input.
    try testing.expectEqualSlices(u8, "\xf0\x9f\x92\xa9", bytes);
}

/// Test helper: converts `wtf8` to WTF-16LE and back through each API flavor
/// (caller-provided buffers, `Alloc`, `AllocZ`) and expects to get `wtf8` back.
/// The buffer flavor uses fixed 32-element buffers, so inputs must fit in them.
fn testRoundtripWtf8(wtf8: []const u8) !void {
    // Buffer
    {
        var wtf16_buf: [32]u16 = undefined;
        const wtf16_len = try wtf8ToWtf16Le(&wtf16_buf, wtf8);
        const wtf16 = wtf16_buf[0..wtf16_len];

        var roundtripped_buf: [32]u8 = undefined;
        const roundtripped_len = wtf16LeToWtf8(&roundtripped_buf, wtf16);
        const roundtripped = roundtripped_buf[0..roundtripped_len];

        try testing.expectEqualSlices(u8, wtf8, roundtripped);
    }
    // Alloc
    {
        const wtf16 = try wtf8ToWtf16LeAlloc(testing.allocator, wtf8);
        defer testing.allocator.free(wtf16);

        const roundtripped = try wtf16LeToWtf8Alloc(testing.allocator, wtf16);
        defer testing.allocator.free(roundtripped);

        try testing.expectEqualSlices(u8, wtf8, roundtripped);
    }
    // AllocZ
    {
        const wtf16 = try wtf8ToWtf16LeAllocZ(testing.allocator, wtf8);
        defer testing.allocator.free(wtf16);

        const roundtripped = try wtf16LeToWtf8AllocZ(testing.allocator, wtf16);
        defer testing.allocator.free(roundtripped);

        try testing.expectEqualSlices(u8, wtf8, roundtripped);
    }
}

test "well-formed WTF-8 roundtrips" {
    // Each input is expected to survive WTF-8 -> WTF-16LE -> WTF-8 unchanged.
    const inputs = [_][]const u8{
        "\xed\x9f\xbf", // not a surrogate half
        "\xed\xa0\xbd", // high surrogate
        "\xed\xb2\xa9", // low surrogate
        "\xed\xa0\xbd \xed\xb2\xa9", // <high surrogate><space><low surrogate>
        "\xed\xa0\x80\xed\xaf\xbf", // <high surrogate><high surrogate>
        "\xed\xa0\x80\xee\x80\x80", // <high surrogate><not surrogate>
        "\xed\x9f\xbf\xed\xb0\x80", // <not surrogate><low surrogate>
        "a\xed\xb0\x80", // <not surrogate><low surrogate>
        "\xf0\x9f\x92\xa9", // U+1F4A9, encoded as a surrogate pair in WTF-16
    };
    for (inputs) |input| {
        try testRoundtripWtf8(input);
    }
}

/// Test helper: converts `wtf16le` to WTF-8 and back through each API flavor
/// (caller-provided buffers, `Alloc`, `AllocZ`) and expects to get `wtf16le` back.
/// The buffer flavor uses fixed 32-element buffers, so inputs must fit in them.
fn testRoundtripWtf16(wtf16le: []const u16) !void {
    // Buffer
    {
        var wtf8_buf: [32]u8 = undefined;
        const wtf8_len = wtf16LeToWtf8(&wtf8_buf, wtf16le);
        const wtf8 = wtf8_buf[0..wtf8_len];

        var roundtripped_buf: [32]u16 = undefined;
        const roundtripped_len = try wtf8ToWtf16Le(&roundtripped_buf, wtf8);
        const roundtripped = roundtripped_buf[0..roundtripped_len];

        try testing.expectEqualSlices(u16, wtf16le, roundtripped);
    }
    // Alloc
    {
        const wtf8 = try wtf16LeToWtf8Alloc(testing.allocator, wtf16le);
        defer testing.allocator.free(wtf8);

        const roundtripped = try wtf8ToWtf16LeAlloc(testing.allocator, wtf8);
        defer testing.allocator.free(roundtripped);

        try testing.expectEqualSlices(u16, wtf16le, roundtripped);
    }
    // AllocZ
    {
        const wtf8 = try wtf16LeToWtf8AllocZ(testing.allocator, wtf16le);
        defer testing.allocator.free(wtf8);

        const roundtripped = try wtf8ToWtf16LeAllocZ(testing.allocator, wtf8);
        defer testing.allocator.free(roundtripped);

        try testing.expectEqualSlices(u16, wtf16le, roundtripped);
    }
}

test "well-formed WTF-16 roundtrips" {
    // Each input is expected to survive WTF-16LE -> WTF-8 -> WTF-16LE unchanged.
    const le = std.mem.nativeToLittle;
    const inputs = [_][]const u16{
        // <high surrogate><low surrogate>
        &[_]u16{ le(u16, 0xD83D), le(u16, 0xDCA9) },
        // <high surrogate><not surrogate><low surrogate>
        &[_]u16{ le(u16, 0xD83D), le(u16, ' '), le(u16, 0xDCA9) },
        // <high surrogate><high surrogate>
        &[_]u16{ le(u16, 0xD800), le(u16, 0xDBFF) },
        // <high surrogate><not surrogate>
        &[_]u16{ le(u16, 0xD800), le(u16, 0xE000) },
        // <not surrogate><low surrogate>
        &[_]u16{ le(u16, 0xD7FF), le(u16, 0xDC00) },
        // <not surrogate><low surrogate>
        &[_]u16{ le(u16, 0x61), le(u16, 0xDC00) },
        // <low surrogate>
        &[_]u16{le(u16, 0xDC00)},
    };
    for (inputs) |input| {
        try testRoundtripWtf16(input);
    }
}