Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -753,6 +753,9 @@
   [llvm_i1_ty], [IntrNoMem, IntrSpeculatable, IntrConvergent]
 >;
 
+// If false, set EXEC=0 for the current thread until the end of program.
+def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -167,6 +167,12 @@
   [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
 >;
 
+def COND_EQ_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOEQ;}]>;
+def COND_NE_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETONE;}]>;
+def COND_GT_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOGT;}]>;
+def COND_GE_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOGE;}]>;
+def COND_LT_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOLT;}]>;
+def COND_LE_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOLE;}]>;
 def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
 def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2431,7 +2431,7 @@
 
   if (SplitPoint == BB->end()) {
     // Don't bother with a new block.
-    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
     return BB;
   }
@@ -2445,7 +2445,7 @@
   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
   BB->addSuccessor(SplitBB);
 
-  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
   return SplitBB;
 }
@@ -2999,7 +2999,8 @@
   case AMDGPU::SI_INDIRECT_DST_V8:
   case AMDGPU::SI_INDIRECT_DST_V16:
     return emitIndirectDst(MI, *BB, *getSubtarget());
-  case AMDGPU::SI_KILL:
+  case AMDGPU::SI_KILL_F32_GE_0_PSEUDO:
+  case AMDGPU::SI_KILL_I1_PSEUDO:
     return splitKillBlock(MI, BB);
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Index: lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertSkips.cpp
+++ lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -198,17 +198,36 @@
   assert(CallConv == CallingConv::AMDGPU_PS ||
          CallConv == CallingConv::AMDGPU_GS);
 #endif
-  // Clear this thread from the exec mask if the operand is negative.
-  if (Op.isImm()) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
-    }
-  } else {
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_KILL_F32_GE_0_TERMINATOR:
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
       .addImm(0)
       .add(Op);
+    break;
+  case AMDGPU::SI_KILL_I1_TERMINATOR: {
+    int64_t KillVal = MI.getOperand(1).getImm();
+    assert(KillVal == 0 || KillVal == -1);
+
+    // Kill all threads if Op0 is an immediate and equal to the Kill value.
+    if (Op.isImm()) {
+      int64_t Imm = Op.getImm();
+      assert(Imm == 0 || Imm == -1);
+
+      if (Imm == KillVal)
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+          .addImm(0);
+      break;
+    }
+
+    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
+    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC)
+      .add(Op);
+    break;
+  }
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
   }
 }
@@ -301,7 +320,8 @@
       }
       break;
 
-    case AMDGPU::SI_KILL_TERMINATOR:
+    case AMDGPU::SI_KILL_F32_GE_0_TERMINATOR:
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
       MadeChange = true;
       kill(MI);
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -856,6 +856,9 @@
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               unsigned DestReg) const;
+
+  static bool isKillTerminator(unsigned Opcode);
+  const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
 };
 
 namespace AMDGPU {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4569,3 +4569,24 @@
   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
 }
+
+bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_GE_0_TERMINATOR:
+  case AMDGPU::SI_KILL_I1_TERMINATOR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_GE_0_PSEUDO:
+    return get(AMDGPU::SI_KILL_F32_GE_0_TERMINATOR);
+  case AMDGPU::SI_KILL_I1_PSEUDO:
+    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
+  }
+}
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -277,18 +277,21 @@
 }
 
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : PseudoInstSI <
-  (outs), (ins VSrc_b32:$src),
-  [(AMDGPUkill i32:$src)]> {
-  let isConvergent = 1;
-  let usesCustomInserter = 1;
-}
-
-def SI_KILL_TERMINATOR : SPseudoInstSI <
-  (outs), (ins VSrc_b32:$src)> {
-  let isTerminator = 1;
+multiclass PseudoInstKill <dag ins> {
+  def _PSEUDO : PseudoInstSI <(outs), ins> {
+    let isConvergent = 1;
+    let usesCustomInserter = 1;
+  }
+
+  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
+    let isTerminator = 1;
+  }
 }
 
+defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_F32_GE_0 : PseudoInstKill <(ins VSrc_b32:$src0)>;
+
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
   [], " ; illegal copy $src to $dst">;
@@ -551,8 +554,35 @@
 
 def : Pat <
   (int_AMDGPU_kilp),
-  (SI_KILL (i32 0xbf800000))
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
 >;
+
+def : Pat <
+  // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
+  (AMDGPUkill (i32 -1082130432)),
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill i1:$src),
+  (SI_KILL_I1_PSEUDO $src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (not i1:$src))),
+  (SI_KILL_I1_PSEUDO $src, -1)
+>;
+
+def : Pat <
+  (AMDGPUkill i32:$src),
+  (SI_KILL_F32_GE_0_PSEUDO $src)
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (setcc f32:$src, FP_ZERO, COND_GE_NONANS))),
+  (SI_KILL_F32_GE_0_PSEUDO $src)
+>;
+
+// TODO: we could add more variants for other types of conditionals
 
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -134,7 +134,8 @@
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
+                       const SIInstrInfo *TII) {
   unsigned SaveExecReg = MI.getOperand(0).getReg();
   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
 
@@ -143,7 +144,7 @@
       U->getOpcode() != AMDGPU::SI_END_CF)
     return false;
 
-  // Check for SI_KILL_TERMINATOR on path from if to endif.
+  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
   // if there is any such terminator simplifications are not safe.
   auto SMBB = MI.getParent();
   auto EMBB = U->getParent();
@@ -157,7 +158,7 @@
     if (MBB == EMBB || !Visited.insert(MBB).second)
       continue;
     for(auto &Term : MBB->terminators())
-      if (Term.getOpcode() == AMDGPU::SI_KILL_TERMINATOR)
+      if (TII->isKillTerminator(Term.getOpcode()))
        return false;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
@@ -184,7 +185,7 @@
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI);
+  bool SimpleIf = isSimpleIf(MI, MRI, TII);
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3540,6 +3540,14 @@
     // wqm_vote is identity when the argument is constant.
     return replaceInstUsesWith(*II, II->getArgOperand(0));
   }
+  case Intrinsic::amdgcn_kill: {
+    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
+    if (!C || !C->getZExtValue())
+      break;
+
+    // amdgcn.kill(i1 1) is a no-op
+    return eraseInstFromFunction(CI);
+  }
   case Intrinsic::stackrestore: {
     // If the save is right next to the restore, remove the restore. This can
     // happen when variable allocas are DCE'd.
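For reference, a minimal IR-level sketch of the fold the InstCombineCalls.cpp hunk above adds (the function name @kill_sketch is illustrative, not part of the patch): a kill whose argument folds to the constant true is erased as a no-op, while a false or non-constant argument is left for the backend, where the SIInstructions.td patterns select SI_KILL_I1_PSEUDO and SIInsertSkips lowers it to an exec-mask update.

  ; Input to -instcombine:
  define amdgpu_ps void @kill_sketch(i1 %cond) {
    call void @llvm.amdgcn.kill(i1 true)  ; no-op, erased by instcombine
    call void @llvm.amdgcn.kill(i1 %cond) ; kept; lowers to s_and_b64 exec, exec, <cond>
    ret void
  }
  declare void @llvm.amdgcn.kill(i1)
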
Index: test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
===================================================================
--- test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
+++ test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -33,7 +33,7 @@
   bb.1:
     successors: %bb.2
     %vgpr0 = V_MOV_B32_e32 0, implicit %exec
-    SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
+    SI_KILL_F32_GE_0_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
     S_BRANCH %bb.2
 
   bb.2:
Index: test/CodeGen/AMDGPU/kill.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/kill.ll
@@ -0,0 +1,159 @@
+; RUN: opt -S -mtriple=amdgcn-- -instcombine < %s | llc -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: opt -S -mtriple=amdgcn-- -instcombine < %s | llc -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}kill_gs_const:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @kill_gs_const() {
+main_body:
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp3)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
+entry:
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  call void @llvm.AMDGPU.kill(float %arg14)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; SI-LABEL: {{^}}new_kill_gs_const:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @new_kill_gs_const() {
+main_body:
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  %c1 = fcmp oge float %tmp1, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  %c2 = fcmp oge float %tmp3, 0.0
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}new_kill_vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define amdgpu_ps void @new_kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
+entry:
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  %c1 = fcmp oge float %arg14, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_true:
+; SI-NOT: v_cmpx
+; SI-NOT: s_mov
+define amdgpu_gs void @kill_true() {
+main_body:
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_false:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @kill_false() {
+main_body:
+  call void @llvm.amdgcn.kill(i1 false)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_and:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_or_b64 s[0:1]
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_gs void @kill_and(i32 %a, i32 %b, i32 %c, i32 %d) {
+main_body:
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = or i1 %c1, %c2
+  call void @llvm.amdgcn.kill(i1 %x)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_andn2:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_xor_b64 s[0:1]
+; SI: s_andn2_b64 exec, exec, s[0:1]
+define amdgpu_gs void @kill_andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
+main_body:
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = xor i1 %c1, %c2
+  %y = xor i1 %x, 1
+  call void @llvm.amdgcn.kill(i1 %y)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_oge:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @kill_oge(float %a) {
+main_body:
+  %c1 = fcmp oge float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_neg_olt:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @kill_neg_olt(float %a) {
+main_body:
+  %c1 = fcmp olt float %a, 0.0
+  %c2 = xor i1 %c1, 1
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_uge:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @kill_uge(float %a) {
+main_body:
+  %c1 = fcmp uge float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}fcmp_x2:
+; FIXME: LLVM should be able to combine these fcmp opcodes.
+; SI: v_cmp_gt_f32
+; SI: v_cndmask_b32
+; SI: v_cmpx_le_f32
+define amdgpu_ps void @fcmp_x2(float %a) #0 {
+main_body:
+  %ogt = fcmp nsz ogt float %a, 2.500000e-01
+  %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00
+  %c = fcmp nsz oge float %k, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %c) #1
+  ret void
+}
+
+declare void @llvm.AMDGPU.kill(float) #0
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}kill_gs_const:
-; SI-NOT: v_cmpx_le_f32
-; SI: s_mov_b64 exec, 0
-define amdgpu_gs void @kill_gs_const() {
-main_body:
-  %tmp = icmp ule i32 0, 3
-  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %tmp1)
-  %tmp2 = icmp ule i32 3, 0
-  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %tmp3)
-  ret void
-}
-
-; SI-LABEL: {{^}}kill_vcc_implicit_def:
-; SI-NOT: v_cmp_gt_f32_e32 vcc,
-; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
-; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
-entry:
-  %tmp0 = fcmp olt float %arg13, 0.000000e+00
-  call void @llvm.AMDGPU.kill(float %arg14)
-  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
-  ret void
-}
-
-declare void @llvm.AMDGPU.kill(float) #0
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-
-attributes #0 = { nounwind }
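As a usage sketch (not part of the patch; function names are illustrative): a frontend migrating from the legacy float kill, which kills a lane when its operand is negative, can pass the inverted sign test directly as an i1, and the COND_GE_NONANS pattern above still selects a single v_cmpx_le_f32 for that common shape, as the kill_oge test checks.

  ; Legacy form: kills the lane when %x < 0.
  define amdgpu_ps void @kill_if_negative_old(float %x) {
    call void @llvm.AMDGPU.kill(float %x)
    ret void
  }

  ; New form: the lane is killed when the i1 operand is false, so any
  ; predicate works; this one matches the COND_GE_NONANS pattern and
  ; selects v_cmpx_le_f32 0, %x.
  define amdgpu_ps void @kill_if_negative_new(float %x) {
    %keep = fcmp oge float %x, 0.0
    call void @llvm.amdgcn.kill(i1 %keep)
    ret void
  }

  declare void @llvm.AMDGPU.kill(float)
  declare void @llvm.amdgcn.kill(i1)

Making the condition an explicit i1 also lets the backend lower general predicates straight to exec-mask arithmetic (s_and_b64 / s_andn2_b64, per the SIInsertSkips hunk) instead of forcing every kill through a float compare.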