diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4169,7 +4169,9 @@ return false; } - if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) { + uint64_t TSFlags = MII.get(Opc).TSFlags; + + if (isGFX940() && (TSFlags & SIInstrFlags::IsDOT)) { int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); if (OpSelIdx != -1) { if (Inst.getOperand(OpSelIdx).getImm() != 0) @@ -4182,6 +4184,15 @@ } } + // op_sel[0:1] must be 0 for v_dot2_bf16_bf16 and v_dot2_f16_f16 (VOP3 Dot). + if ((TSFlags & SIInstrFlags::IsDOT) && (TSFlags & SIInstrFlags::VOP3) && + !(TSFlags & SIInstrFlags::VOP3P)) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + // Like the GFX940 check above, tolerate opcodes without an op_sel operand. + if (OpSelIdx != -1 && (Inst.getOperand(OpSelIdx).getImm() & 3)) + return false; + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -760,15 +760,19 @@ } class VOP3_DOT_Profile : VOP3_Profile { - // FIXME VOP3 DPP versions are unsupported - let HasExtVOP3DPP = 0; let HasClamp = 0; let HasOMod = 0; - let InsVOP3OpSel = getInsVOP3OpSel.ret, FPVRegInputMods, IntOpSelMods), - !if(isFloatType.ret, FPVRegInputMods, IntOpSelMods), - !if(isFloatType.ret, FPVRegInputMods, IntOpSelMods)>.ret; + // Override modifiers for bf16(i16) (same as float modifiers). 
+ let HasSrc0Mods = 1; + let HasSrc1Mods = 1; + let HasSrc2Mods = 1; + let Src0ModDPP = FPVRegInputMods; + let Src1ModDPP = FPVRegInputMods; + let Src2ModVOP3DPP = FPVRegInputMods; + let InsVOP3OpSel = getInsVOP3OpSel.ret; + let AsmVOP3OpSel = getAsmVOP3OpSel.ret; } let SubtargetPredicate = isGFX11Plus in { @@ -784,7 +788,7 @@ defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile>; } // End SubtargetPredicate = isGFX11Plus -let SubtargetPredicate = HasDot8Insts in { +let SubtargetPredicate = HasDot8Insts, IsDOT=1 in { defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile, int_amdgcn_fdot2_f16_f16>; defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile, int_amdgcn_fdot2_bf16_bf16>; } @@ -909,9 +913,8 @@ defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>; defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>; defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>; -// FIXME VOP3 DPP Dot instructions are unsupported -defm V_DOT2_F16_F16 : VOP3_Real_Base_gfx11<0x266>; -defm V_DOT2_BF16_BF16 : VOP3_Real_Base_gfx11<0x267>; +defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11<0x266>; +defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>; defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -269,6 +269,10 @@ class VOP3OpSel_gfx11 op, VOPProfile p> : VOP3OpSel_gfx10; +class VOP3DotOpSel_gfx11 op, VOPProfile p> : VOP3OpSel_gfx11{ + let Inst{11} = ?; + let Inst{12} = ?; +} // NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa class VOP3Interp_vi op, VOPProfile P> : VOP3e_vi { @@ -1271,6 +1275,7 @@ class Base_VOP3_DPP16 op, 
VOP_DPP_Pseudo ps, string opName = ps.OpName> : VOP3_DPP { let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; let SchedRW = ps.SchedRW; @@ -1287,6 +1292,7 @@ class Base_VOP3_DPP8 op, VOP_Pseudo ps, string opName = ps.OpName> : VOP3_DPP8 { let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; let hasSideEffects = ps.hasSideEffects; let Defs = ps.Defs; let SchedRW = ps.SchedRW; @@ -1328,6 +1334,15 @@ VOP3e_gfx11; } } + multiclass VOP3Dot_Real_Base_gfx11 op, string opName = NAME, + bit isSingle = 0> { + defvar ps = !cast(opName#"_e64"); + let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { + def _e64_gfx11 : + VOP3_Real, + VOP3DotOpSel_gfx11; + } + } multiclass VOP3_Real_with_name_gfx11 op, string opName, string asmName, bit isSingle = 0> { defvar ps = !cast(opName#"_e64"); @@ -1357,6 +1372,15 @@ let DecoderNamespace = "DPPGFX11"; } } + + multiclass VOP3Dot_Real_dpp_Base_gfx11 op, string opName = NAME> { + def _e64_dpp_gfx11 : VOP3_DPP16(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> { + let Inst{11} = ?; + let Inst{12} = ?; + let DecoderNamespace = "DPPGFX11"; + } + } + multiclass VOP3_Real_dpp_with_name_gfx11 op, string opName, string asmName> { defvar ps = !cast(opName#"_e64"); @@ -1370,6 +1394,16 @@ let DecoderNamespace = "DPP8GFX11"; } } + + multiclass VOP3Dot_Real_dpp8_Base_gfx11 op, string opName = NAME> { + defvar ps = !cast(opName#"_e64"); + def _e64_dpp8_gfx11 : Base_VOP3_DPP8 { + let Inst{11} = ?; + let Inst{12} = ?; + let DecoderNamespace = "DPP8GFX11"; + } + } + multiclass VOP3_Real_dpp8_with_name_gfx11 op, string opName, string asmName> { defvar ps = !cast(opName#"_e64"); @@ -1408,6 +1442,12 @@ VOP3_Real_dpp_Base_gfx11, VOP3_Real_dpp8_Base_gfx11; +multiclass VOP3Dot_Realtriple_gfx11 op, + bit isSingle = 0, string opName = NAME> : + VOP3Dot_Real_Base_gfx11, + VOP3Dot_Real_dpp_Base_gfx11, + VOP3Dot_Real_dpp8_Base_gfx11; + multiclass VOP3Only_Realtriple_gfx11 op> : VOP3_Realtriple_gfx11; 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,SDAG-GFX11 +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11 declare i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a, <2 x i16> %b, i16 %c) -define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16( -; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16: +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16( +; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 @@ -30,3 +30,47 @@ store i16 %r.val, i16 addrspace(1)* %r ret void } + +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp( +; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: +; SDAG-GFX11: ; %bb.0: ; %entry +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 +; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 +; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1 +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0 +; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; SDAG-GFX11-NEXT: s_endpgm +; +; 
GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp: +; GISEL-GFX11: ; %bb.0: ; %entry +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1 +; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2 +; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s3 +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0 +; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GISEL-GFX11-NEXT: s_endpgm + i16 addrspace(5)* %r, + <2 x i16> addrspace(5)* %a, + <2 x i16> addrspace(5)* %b, + i16 addrspace(5)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(5)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(5)* %b + %c.val = load i16, i16 addrspace(5)* %c + %a.val.i32 = bitcast <2 x i16> %a.val to i32 + %dpp = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a.val.i32, i32 %a.val.i32, i32 1, i32 15, i32 15, i1 1) + %a.val.dpp.v2i16 = bitcast i32 %dpp to <2 x i16> + %r.val = call i16 @llvm.amdgcn.fdot2.bf16.bf16(<2 x i16> %a.val.dpp.v2i16, <2 x i16> %b.val, i16 %c.val) + store i16 %r.val, i16 addrspace(5)* %r + ret void +} + +declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=GFX11,SDAG-GFX11 +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11,GISEL-GFX11 declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c) @@ -30,3 +30,47 @@ store half %r.val, half addrspace(1)* %r ret void } + +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp( +; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: +; SDAG-GFX11: ; %bb.0: ; %entry +; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2 +; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3 +; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1 +; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0 +; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; SDAG-GFX11-NEXT: s_endpgm +; +; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp: +; GISEL-GFX11: ; %bb.0: ; %entry +; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1 +; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2 +; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s3 +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0 +; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GISEL-GFX11-NEXT: s_endpgm + half addrspace(5)* %r, + <2 x half> addrspace(5)* %a, + <2 x half> addrspace(5)* %b, + half addrspace(5)* %c) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(5)* %a + %b.val = load <2 x half>, <2 x half> addrspace(5)* %b + %c.val = load half, half addrspace(5)* %c + %a.val.i32 = bitcast <2 x half> %a.val to i32 + %dpp = call i32 
@llvm.amdgcn.update.dpp.i32(i32 %a.val.i32, i32 %a.val.i32, i32 1, i32 15, i32 15, i1 1) + %a.val.dpp.v2half = bitcast i32 %dpp to <2 x half> + %r.val = call half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a.val.dpp.v2half, <2 x half> %b.val, half %c.val) + store half %r.val, half addrspace(5)* %r + ret void +} + +declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_dpp16.s --- a/llvm/test/MC/AMDGPU/gfx11_asm_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_dpp16.s @@ -1,4 +1,5 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR --implicit-check-not=error %s v_mov_b32 v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 // GFX11: encoding: [0xfa,0x02,0x0a,0x7e,0x01,0x1b,0x00,0x00] @@ -632,3 +633,27 @@ v_movrelsd_b32 v0, v255 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0 // GFX11: encoding: [0xfa,0x88,0x00,0x7e,0xff,0x1b,0x00,0x00] + +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] + +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand + +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] + +v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc4,0x01,0xe4,0x04,0x00] + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 
quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] + +v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX11: encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc4,0x01,0xe4,0x04,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_dpp8.s --- a/llvm/test/MC/AMDGPU/gfx11_asm_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_dpp8.s @@ -1,4 +1,5 @@ -// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1100 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX11-ERR --implicit-check-not=error %s v_mov_b32 v5, v1 dpp8:[0,1,2,3,4,5,6,7] // GFX11: encoding: [0xe9,0x02,0x0a,0x7e,0x01,0x88,0xc6,0xfa] @@ -519,3 +520,27 @@ v_subrev_nc_u32 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX11: encoding: [0xea,0x04,0x0a,0x4e,0x01,0x77,0x39,0x05] +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] +// GFX11: encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] + +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand + +v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] +// GFX11: encoding: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] + +v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|v3| 
op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] +// GFX11: encoding: [0x00,0x65,0x66,0xd6,0xe9,0x04,0x0e,0xc4,0x01,0x88,0x46,0x92] + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] +// GFX11: encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[1,1,0,0] dpp8:[0,1,2,3,4,4,4,4] +// GFX11-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid op_sel operand + +v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] +// GFX11: encoding: [0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] + +v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] +// GFX11: encoding: [0x00,0x65,0x67,0xd6,0xe9,0x04,0x0e,0xc4,0x01,0x88,0x46,0x92] + diff --git a/llvm/test/MC/AMDGPU/gfx11_vop123.s b/llvm/test/MC/AMDGPU/gfx11_vop123.s --- a/llvm/test/MC/AMDGPU/gfx11_vop123.s +++ b/llvm/test/MC/AMDGPU/gfx11_vop123.s @@ -4271,15 +4271,29 @@ v_dot2_f16_f16 v0, v1, v2, v3 // GFX11: encoding: [0x00,0x00,0x66,0xd6,0x01,0x05,0x0e,0x04] +v_dot2_f16_f16 v0, v1, v2, v3 op_sel:[1,1,0,0] +// W32-ERR: error: invalid op_sel operand +// W64-ERR: error: invalid op_sel operand + v_dot2_f16_f16 v0, v1, v2, v3 op_sel:[0,0,1,1] // GFX11: encoding: [0x00,0x60,0x66,0xd6,0x01,0x05,0x0e,0x04] +v_dot2_f16_f16 v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] +// GFX11: encoding: [0x00,0x65,0x66,0xd6,0x01,0x05,0x0e,0xc4] + v_dot2_bf16_bf16 v0, v1, v2, v3 // GFX11: encoding: [0x00,0x00,0x67,0xd6,0x01,0x05,0x0e,0x04] +v_dot2_bf16_bf16 v0, v1, v2, v3 op_sel:[1,1,0,0] +// W32-ERR: error: invalid op_sel operand +// W64-ERR: error: invalid op_sel operand + v_dot2_bf16_bf16 v0, v1, v2, v3 op_sel:[0,0,1,1] // GFX11: encoding: [0x00,0x60,0x67,0xd6,0x01,0x05,0x0e,0x04] +v_dot2_bf16_bf16 v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] +// GFX11: encoding: [0x00,0x65,0x67,0xd6,0x01,0x05,0x0e,0xc4] + v_dot2c_f32_f16_e32 v5, v1, v2 // GFX11: encoding: [0x01,0x05,0x0a,0x04] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_all.txt @@ -14538,15 +14538,29 @@ # GFX11: v_dot2_f16_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0x66,0xd6,0x01,0x05,0x0e,0x04] 0x00,0x00,0x66,0xd6,0x01,0x05,0x0e,0x04 +# op_sel[1:0] are ignored +# GFX11: v_dot2_f16_f16 v0, v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x00,0x60,0x66,0xd6,0x01,0x05,0x0e,0x04] +0x00,0x78,0x66,0xd6,0x01,0x05,0x0e,0x04 + # GFX11: v_dot2_f16_f16 v0, v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x00,0x60,0x66,0xd6,0x01,0x05,0x0e,0x04] 0x00,0x60,0x66,0xd6,0x01,0x05,0x0e,0x04 +# GFX11: v_dot2_f16_f16 v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] ; encoding: [0x00,0x65,0x66,0xd6,0x01,0x05,0x0e,0xc4] +0x00,0x65,0x66,0xd6,0x01,0x05,0x0e,0xc4 + # GFX11: v_dot2_bf16_bf16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0x67,0xd6,0x01,0x05,0x0e,0x04] 0x00,0x00,0x67,0xd6,0x01,0x05,0x0e,0x04 +# op_sel[1:0] are ignored +# GFX11: v_dot2_bf16_bf16 v0, v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x00,0x60,0x67,0xd6,0x01,0x05,0x0e,0x04] +0x00,0x78,0x67,0xd6,0x01,0x05,0x0e,0x04 + # GFX11: v_dot2_bf16_bf16 v0, v1, v2, v3 op_sel:[0,0,1,1] ; encoding: [0x00,0x60,0x67,0xd6,0x01,0x05,0x0e,0x04] 0x00,0x60,0x67,0xd6,0x01,0x05,0x0e,0x04 +# GFX11: v_dot2_bf16_bf16 v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] ; encoding: [0x00,0x65,0x67,0xd6,0x01,0x05,0x0e,0xc4] +0x00,0x65,0x67,0xd6,0x01,0x05,0x0e,0xc4 + # GFX11: v_dot2acc_f32_f16 v5, v1, v2 ; encoding: [0x01,0x05,0x0a,0x04] 0x01,0x05,0x0a,0x04 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt @@ -14098,3 +14098,29 @@ # GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] +0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00 + +# op_sel[1:0] are ignored +# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] +0x00,0x78,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00 + +# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] +0x00,0x60,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00 + +# GFX11: v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc4,0x01,0xe4,0x04,0x00] +0x00,0x65,0x66,0xd6,0xfa,0x04,0x0e,0xc4,0x01,0xe4,0x04,0x00 + +# GFX11: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] +0x00,0x00,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00 + +# op_sel[1:0] are ignored +# GFX11: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] +0x00,0x78,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00 + +# GFX11: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] +0x00,0x60,0x67,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00 + +# GFX11: v_dot2_bf16_bf16_e64_dpp 
v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc4,0x01,0xe4,0x04,0x00] +0x00,0x65,0x67,0xd6,0xfa,0x04,0x0e,0xc4,0x01,0xe4,0x04,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt @@ -5266,3 +5266,29 @@ # GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] +0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92 + +# op_sel[1:0] are ignored +# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] +0x00,0x78,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92 + +# GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] +0x00,0x60,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92 + +# GFX11: v_dot2_f16_f16_e64_dpp v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x66,0xd6,0xe9,0x04,0x0e,0xc4,0x01,0x88,0x46,0x92] +0x00,0x65,0x66,0xd6,0xe9,0x04,0x0e,0xc4,0x01,0x88,0x46,0x92 + +# GFX11: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] +0x00,0x00,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92 + +# op_sel[1:0] are ignored +# GFX11: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: 
[0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] +0x00,0x78,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92 + +# GFX11: v_dot2_bf16_bf16_e64_dpp v0, v1, v2, v3 op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] +0x00,0x60,0x67,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92 + +# GFX11: v_dot2_bf16_bf16_e64_dpp v0, |v1|, -v2, -|v3| op_sel:[0,0,1,1] dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x65,0x67,0xd6,0xe9,0x04,0x0e,0xc4,0x01,0x88,0x46,0x92] +0x00,0x65,0x67,0xd6,0xe9,0x04,0x0e,0xc4,0x01,0x88,0x46,0x92