srctree

Jacob Young parent 2fcb2f59 513c4c14
x86_64: fix avx2 `@truncacte`

inlinesplit
src/arch/x86_64/CodeGen.zig added: 43, removed: 18, total 25
@@ -3274,8 +3274,8 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
break :dst dst_mcv;
} else dst: {
const dst_mcv = try self.allocRegOrMem(inst, true);
try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
const dst_mcv = try self.allocRegOrMemAdvanced(src_ty, inst, true);
try self.genCopy(src_ty, dst_mcv, src_mcv, .{});
break :dst dst_mcv;
};
 
@@ -3333,22 +3333,40 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
else => .{ .register = try self.copyToTmpRegister(Type.usize, splat_mcv.address()) },
};
 
const dst_reg = registerAlias(dst_mcv.getReg().?, src_abi_size);
const dst_reg = dst_mcv.getReg().?;
const dst_alias = registerAlias(dst_reg, src_abi_size);
if (self.hasFeature(.avx)) {
try self.asmRegisterRegisterMemory(
.{ .vp_, .@"and" },
dst_reg,
dst_reg,
dst_alias,
dst_alias,
try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
);
try self.asmRegisterRegisterRegister(mir_tag, dst_reg, dst_reg, dst_reg);
if (src_abi_size > 16) {
const temp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
const temp_lock = self.register_manager.lockRegAssumeUnused(temp_reg);
defer self.register_manager.unlockReg(temp_lock);
 
try self.asmRegisterRegisterImmediate(
.{ if (self.hasFeature(.avx2)) .v_i128 else .v_f128, .extract },
registerAlias(temp_reg, dst_abi_size),
dst_alias,
Immediate.u(1),
);
try self.asmRegisterRegisterRegister(
mir_tag,
registerAlias(dst_reg, dst_abi_size),
registerAlias(dst_reg, dst_abi_size),
registerAlias(temp_reg, dst_abi_size),
);
} else try self.asmRegisterRegisterRegister(mir_tag, dst_alias, dst_alias, dst_alias);
} else {
try self.asmRegisterMemory(
.{ .p_, .@"and" },
dst_reg,
dst_alias,
try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
);
try self.asmRegisterRegister(mir_tag, dst_reg, dst_reg);
try self.asmRegisterRegister(mir_tag, dst_alias, dst_alias);
}
break :result dst_mcv;
}
@@ -16404,7 +16422,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
},
65...128 => switch (vector_len) {
else => null,
1...2 => .{ .vp_i128, .broadcast },
1...2 => .{ .v_i128, .broadcast },
},
}) orelse break :avx2;
 
@@ -16418,7 +16436,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
registerAlias(dst_reg, @intCast(vector_ty.abiSize(mod))),
try src_mcv.mem(self, self.memSize(scalar_ty)),
) else {
if (mir_tag[0] == .vp_i128) break :avx2;
if (mir_tag[0] == .v_i128) break :avx2;
try self.genSetReg(dst_reg, scalar_ty, src_mcv, .{});
try self.asmRegisterRegister(
mir_tag,
 
src/arch/x86_64/Encoding.zig added: 43, removed: 18, total 25
@@ -415,7 +415,8 @@ pub const Mnemonic = enum {
vfmadd132sd, vfmadd213sd, vfmadd231sd,
vfmadd132ss, vfmadd213ss, vfmadd231ss,
// AVX2
vpbroadcastb, vpbroadcastd, vpbroadcasti128, vpbroadcastq, vpbroadcastw,
vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
vextracti128, vinserti128,
// zig fmt: on
};
 
 
src/arch/x86_64/Mir.zig added: 43, removed: 18, total 25
@@ -230,6 +230,8 @@ pub const Inst = struct {
v_d,
/// VEX-Encoded ___ QuadWord
v_q,
/// VEX-Encoded ___ Integer Data
v_i128,
/// VEX-Encoded Packed ___
vp_,
/// VEX-Encoded Packed ___ Byte
@@ -242,8 +244,6 @@ pub const Inst = struct {
vp_q,
/// VEX-Encoded Packed ___ Double Quadword
vp_dq,
/// VEX-Encoded Packed ___ Integer Data
vp_i128,
/// VEX-Encoded ___ Scalar Single-Precision Values
v_ss,
/// VEX-Encoded ___ Packed Single-Precision Values
@@ -654,6 +654,7 @@ pub const Inst = struct {
/// Variable blend scalar double-precision floating-point values
blendv,
/// Extract packed floating-point values
/// Extract packed integer values
extract,
/// Insert scalar single-precision floating-point value
/// Insert packed floating-point values
@@ -696,6 +697,7 @@ pub const Inst = struct {
sha256rnds2,
 
/// Load with broadcast floating-point data
/// Load integer and broadcast
broadcast,
 
/// Convert 16-bit floating-point values to single-precision floating-point values
 
src/arch/x86_64/encodings.zig added: 43, removed: 18, total 25
@@ -1769,6 +1769,10 @@ pub const table = [_]Entry{
.{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 },
.{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 },
 
.{ .vextracti128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x39 }, 0, .vex_256_w0, .avx2 },
 
.{ .vinserti128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x38 }, 0, .vex_256_w0, .avx2 },
 
.{ .vpabsb, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1c }, 0, .vex_256_wig, .avx2 },
.{ .vpabsd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1e }, 0, .vex_256_wig, .avx2 },
.{ .vpabsw, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1d }, 0, .vex_256_wig, .avx2 },
@@ -1809,7 +1813,7 @@ pub const table = [_]Entry{
.{ .vpbroadcastd, .rm, &.{ .ymm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x58 }, 0, .vex_256_w0, .avx2 },
.{ .vpbroadcastq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_128_w0, .avx2 },
.{ .vpbroadcastq, .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_256_w0, .avx2 },
.{ .vpbroadcasti128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
.{ .vbroadcasti128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
 
.{ .vpcmpeqb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_256_wig, .avx2 },
.{ .vpcmpeqw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x75 }, 0, .vex_256_wig, .avx2 },