diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -47,6 +47,10 @@ GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3pmodsdot : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3opselmods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -219,7 +219,9 @@ bool SelectVOP3OMods(SDValue In, SDValue &Src, SDValue &Clamp, SDValue &Omod) const; - bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods, + bool IsDOT = false) const; + bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -2673,7 +2673,7 @@ } bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, - SDValue &SrcMods) const { + SDValue &SrcMods, bool IsDOT) const { unsigned Mods = 0; Src = In; @@ -2682,7 +2682,8 @@ Src = Src.getOperand(0); } - if (Src.getOpcode() == ISD::BUILD_VECTOR) { + if (Src.getOpcode() == ISD::BUILD_VECTOR && + (!IsDOT || !Subtarget->hasDOTOpSelHazard())) { unsigned VecMods = Mods; SDValue Lo = stripBitcast(Src.getOperand(0)); @@ -2770,6 +2771,11 @@ return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + return SelectVOP3PMods(In, Src, SrcMods, true); +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -173,11 +173,15 @@ selectVOP3Mods_nnan(MachineOperand &Root) const; std::pair - selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const; + selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI, + bool IsDOT = false) const; InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsDOT(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3407,7 +3407,7 @@ std::pair AMDGPUInstructionSelector::selectVOP3PModsImpl( - Register Src, const MachineRegisterInfo &MRI) const { + Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; MachineInstr *MI = MRI.getVRegDef(Src); @@ -3421,6 +3421,7 @@ } // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() // Packed instructions do not have abs modifiers. Mods |= SISrcMods::OP_SEL_1; @@ -3443,6 +3444,21 @@ }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -4047,6 +4047,20 @@ if (OpSel & ~3) return false; } + + if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) { + int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel); + if (OpSelIdx != -1) { + if (Inst.getOperand(OpSelIdx).getImm() != 0) + return false; + } + int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi); + if (OpSelHiIdx != -1) { + if (Inst.getOperand(OpSelHiIdx).getImm() != -1) + return false; + } + } + return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -964,6 +964,11 @@ return HasLdsBranchVmemWARHazard; } + // Cannot use op_sel with v_dot instructions. + bool hasDOTOpSelHazard() const { + return GFX940Insts; + } + bool hasNSAtoVMEMBug() const { return HasNSAtoVMEMBug; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -210,6 +210,8 @@ if (Fold.isImm()) { if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked && !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) && + (!ST.hasDOTOpSelHazard() || + !(MI->getDesc().TSFlags & SIInstrFlags::IsDOT)) && AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST.hasInv2PiInlineImm())) { // Set op_sel/op_sel_hi on this operand or bail out if op_sel is diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1428,6 +1428,8 @@ def VOP3PMods : ComplexPattern; +def VOP3PModsDOT : ComplexPattern; + def VOP3OpSel : ComplexPattern; def VOP3OpSelMods : ComplexPattern; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -32,10 +32,12 @@ ret1)); } -class getVOP3PModPat { - dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers)); - dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)); - dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers)); +class getVOP3PModPat { + dag src0_dag = (P.Src0VT (SrcPat P.Src0VT:$src0, i32:$src0_modifiers)); + dag src1_dag = (P.Src1VT (SrcPat P.Src1VT:$src1, i32:$src1_modifiers)); + dag src2_dag = (P.Src2VT (SrcPat P.Src2VT:$src2, i32:$src2_modifiers)); dag clamp_dag = (i1 timm:$clamp); list ret3 = [(set P.DstVT:$vdst, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -40,14 +40,13 @@ } multiclass VOP3PInst { + SDPatternOperator node = null_frag, bit IsDOT = 0> { def NAME : VOP3P_Pseudo.ret, + getVOP3PModPat.ret, getVOP3Pat.ret)>; } - // Non-packed instructions that use the VOP3P encoding. // VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. multiclass VOP3_VOP3PInst { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll @@ -1,4 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906 +; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX906 +; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940 +; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX9,GFX940 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 @@ -6,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() ; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp: -; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp( i32 addrspace(1)* %r, @@ -23,7 +25,7 @@ } ; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp: -; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX9: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp( i32 addrspace(1)* %r, @@ -41,6 +43,7 @@ ; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_op_sel: ; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}} +; GFX940: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}}{{$}} ; GFX10: v_dot2_u32_u16 v{{[0-9]+}}, 1, v{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1,0] op_sel_hi:[0,0,1]{{$}} define amdgpu_kernel void @test_llvm_amdgcn_udot2_op_sel( i32 addrspace(1)* %r, diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s --- a/llvm/test/MC/AMDGPU/gfx940_err.s +++ b/llvm/test/MC/AMDGPU/gfx940_err.s @@ -58,6 +58,9 @@ buffer_wbl2 scc // GFX940: error: invalid operand for instruction +v_dot2_u32_u16 v0, 1, v0, s2 op_sel:[0,1,0,1] op_sel_hi:[0,0,1,1] +// GFX940: error: invalid op_sel operand + s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_LO) // GFX940: error: specified hardware register is not supported on this GPU