Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -753,6 +753,9 @@
   [llvm_i1_ty], [IntrNoMem, IntrConvergent]
 >;
 
+// If false, set EXEC=0 for the current thread until the end of program.
+def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -167,7 +167,6 @@
   [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
 >;
 
-
 def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
 def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2449,7 +2449,7 @@
   if (SplitPoint == BB->end()) {
     // Don't bother with a new block.
-    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
     return BB;
   }
 
@@ -2463,7 +2463,7 @@
   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
   BB->addSuccessor(SplitBB);
 
-  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
   return SplitBB;
 }
 
@@ -3017,7 +3017,8 @@
   case AMDGPU::SI_INDIRECT_DST_V8:
   case AMDGPU::SI_INDIRECT_DST_V16:
     return emitIndirectDst(MI, *BB, *getSubtarget());
-  case AMDGPU::SI_KILL:
+  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+  case AMDGPU::SI_KILL_I1_PSEUDO:
     return splitKillBlock(MI, BB);
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Index: llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -200,25 +200,101 @@
 void SIInsertSkips::kill(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Op = MI.getOperand(0);
-#ifndef NDEBUG
-  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
-  // Kill is only allowed in pixel / geometry shaders.
-  assert(CallConv == CallingConv::AMDGPU_PS ||
-         CallConv == CallingConv::AMDGPU_GS);
-#endif
-  // Clear this thread from the exec mask if the operand is negative.
-  if (Op.isImm()) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
+    unsigned Opcode = 0;
+
+    // The opcodes are inverted because the inline immediate has to be
+    // the first operand, e.g. from "x < imm" to "imm > x"
+    switch (MI.getOperand(2).getImm()) {
+    case ISD::SETOEQ:
+    case ISD::SETEQ:
+      Opcode = AMDGPU::V_CMPX_EQ_F32_e32;
+      break;
+    case ISD::SETOGT:
+    case ISD::SETGT:
+      Opcode = AMDGPU::V_CMPX_LT_F32_e32;
+      break;
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      Opcode = AMDGPU::V_CMPX_LE_F32_e32;
+      break;
+    case ISD::SETOLT:
+    case ISD::SETLT:
+      Opcode = AMDGPU::V_CMPX_GT_F32_e32;
+      break;
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      Opcode = AMDGPU::V_CMPX_GE_F32_e32;
+      break;
+    case ISD::SETONE:
+    case ISD::SETNE:
+      Opcode = AMDGPU::V_CMPX_LG_F32_e32;
+      break;
+    case ISD::SETO:
+      Opcode = AMDGPU::V_CMPX_O_F32_e32;
+      break;
+    case ISD::SETUO:
+      Opcode = AMDGPU::V_CMPX_U_F32_e32;
+      break;
+    case ISD::SETUEQ:
+      Opcode = AMDGPU::V_CMPX_NLG_F32_e32;
+      break;
+    case ISD::SETUGT:
+      Opcode = AMDGPU::V_CMPX_NGE_F32_e32;
+      break;
+    case ISD::SETUGE:
+      Opcode = AMDGPU::V_CMPX_NGT_F32_e32;
+      break;
+    case ISD::SETULT:
+      Opcode = AMDGPU::V_CMPX_NLE_F32_e32;
+      break;
+    case ISD::SETULE:
+      Opcode = AMDGPU::V_CMPX_NLT_F32_e32;
+      break;
+    case ISD::SETUNE:
+      Opcode = AMDGPU::V_CMPX_NEQ_F32_e32;
+      break;
+    default:
+      llvm_unreachable("invalid ISD:SET cond code");
     }
-  } else {
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
-        .addImm(0)
+
+    // TODO: Allow this:
+    if (!MI.getOperand(0).isReg() ||
+        !TRI->isVGPR(MBB.getParent()->getRegInfo(),
+                     MI.getOperand(0).getReg()))
+      llvm_unreachable("SI_KILL operand should be a VGPR");
+
+    BuildMI(MBB, &MI, DL, TII->get(Opcode))
+        .add(MI.getOperand(1))
+        .add(MI.getOperand(0));
+    break;
+  }
+  case AMDGPU::SI_KILL_I1_TERMINATOR: {
+    const MachineOperand &Op = MI.getOperand(0);
+    int64_t KillVal = MI.getOperand(1).getImm();
+    assert(KillVal == 0 || KillVal == -1);
+
+    // Kill all threads if Op0 is an immediate and equal to the Kill value.
+    if (Op.isImm()) {
+      int64_t Imm = Op.getImm();
+      assert(Imm == 0 || Imm == -1);
+
+      if (Imm == KillVal)
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+          .addImm(0);
+      break;
+    }
+
+    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
+    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
+        .addReg(AMDGPU::EXEC)
         .add(Op);
+    break;
+  }
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
   }
 }
 
@@ -311,7 +387,8 @@
       }
       break;
 
-    case AMDGPU::SI_KILL_TERMINATOR:
+    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
       MadeChange = true;
       kill(MI);
 
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -857,6 +857,9 @@
                             MachineBasicBlock::iterator I,
                             const DebugLoc &DL,
                             unsigned DestReg) const;
+
+  static bool isKillTerminator(unsigned Opcode);
+  const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
 };
 
 namespace AMDGPU {
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4591,3 +4591,24 @@
   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
 }
+
+bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+  case AMDGPU::SI_KILL_I1_TERMINATOR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
+  case AMDGPU::SI_KILL_I1_PSEUDO:
+    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
+  }
+}
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
@@ -297,6 +297,10 @@
   return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
 }]>;
 
+def cond_as_i32imm: SDNodeXForm<cond, [{
+  return CurDAG->getTargetConstant(N->get(), SDLoc(N), MVT::i32);
+}]>;
+
 // Copied from the AArch64 backend:
 def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
 return CurDAG->getTargetConstant(
Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -275,18 +275,21 @@
 }
 
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : PseudoInstSI <
-  (outs), (ins VSrc_b32:$src),
-  [(AMDGPUkill i32:$src)]> {
-  let isConvergent = 1;
-  let usesCustomInserter = 1;
-}
 
-def SI_KILL_TERMINATOR : SPseudoInstSI <
-  (outs), (ins VSrc_b32:$src)> {
-  let isTerminator = 1;
+multiclass PseudoInstKill <dag ins> {
+  def _PSEUDO : PseudoInstSI <(outs), ins> {
+    let isConvergent = 1;
+    let usesCustomInserter = 1;
+  }
+
+  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
+    let isTerminator = 1;
+  }
 }
 
+defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
   [], " ; illegal copy $src to $dst">;
@@ -546,8 +549,35 @@
 def : GCNPat <
   (int_AMDGPU_kilp),
-  (SI_KILL (i32 0xbf800000))
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
+  (AMDGPUkill (i32 -1082130432)),
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill i1:$src),
+  (SI_KILL_I1_PSEUDO $src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (not i1:$src))),
+  (SI_KILL_I1_PSEUDO $src, -1)
+>;
+
+def : Pat <
+  (AMDGPUkill i32:$src),
+  (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm:$imm, cond:$cond))),
+  (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
+// TODO: we could add more variants for other types of conditionals
 
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
Index: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -134,7 +134,8 @@
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
+                       const SIInstrInfo *TII) {
   unsigned SaveExecReg = MI.getOperand(0).getReg();
   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
 
@@ -143,7 +144,7 @@
       U->getOpcode() != AMDGPU::SI_END_CF)
     return false;
 
-  // Check for SI_KILL_TERMINATOR on path from if to endif.
+  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
   // if there is any such terminator simplififcations are not safe.
   auto SMBB = MI.getParent();
   auto EMBB = U->getParent();
@@ -157,7 +158,7 @@
     if (MBB == EMBB || !Visited.insert(MBB).second)
       continue;
     for(auto &Term : MBB->terminators())
-      if (Term.getOpcode() == AMDGPU::SI_KILL_TERMINATOR)
+      if (TII->isKillTerminator(Term.getOpcode()))
        return false;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
@@ -184,7 +185,7 @@
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI);
+  bool SimpleIf = isSimpleIf(MI, MRI, TII);
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3539,6 +3539,14 @@
     return replaceInstUsesWith(*II, II->getArgOperand(0));
   }
+  case Intrinsic::amdgcn_kill: {
+    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
+    if (!C || !C->getZExtValue())
+      break;
+
+    // amdgcn.kill(i1 1) is a no-op
+    return eraseInstFromFunction(CI);
+  }
   case Intrinsic::stackrestore: {
     // If the save is right next to the restore, remove the restore. This can
     // happen when variable allocas are DCE'd.
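As a usage note (not part of the patch): the V_CMPX fast path above only fires when the kill condition is a single f32 compare against an inline immediate; everything else goes through the i1 pseudo. A minimal sketch of the fast-path case, with a made-up function name and threshold, is the classic alpha-test discard:

; Hypothetical alpha-test kill. 0.5 is an inline immediate, so the
; (setcc f32:$src, InlineFPImm:$imm, cond:$cond) pattern above should
; select this to SI_KILL_F32_COND_IMM_PSEUDO and finally to a single
; v_cmpx_le_f32 (oge is inverted because the immediate comes first).
define amdgpu_ps void @alpha_test_kill(float %alpha) {
  %keep = fcmp oge float %alpha, 0.5
  call void @llvm.amdgcn.kill(i1 %keep)
  ret void
}

declare void @llvm.amdgcn.kill(i1)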
Index: llvm/trunk/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -33,7 +33,7 @@
   bb.1:
     successors: %bb.2
     %vgpr0 = V_MOV_B32_e32 0, implicit %exec
-    SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
+    SI_KILL_F32_COND_IMM_TERMINATOR %vgpr0, 0, 3, implicit-def %exec, implicit-def %vcc, implicit %exec
     S_BRANCH %bb.2
 
   bb.2:
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -0,0 +1,241 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+
+; SI-LABEL: {{^}}gs_const:
+; SI-NOT: v_cmpx
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @gs_const() {
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  %c1 = fcmp oge float %tmp1, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  %c2 = fcmp oge float %tmp3, 0.0
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) {
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  %c1 = fcmp oge float %arg14, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; SI-LABEL: {{^}}true:
+; SI-NEXT: BB#
+; SI-NEXT: BB#
+; SI-NEXT: s_endpgm
+define amdgpu_gs void @true() {
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+; SI-LABEL: {{^}}false:
+; SI-NOT: v_cmpx
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @false() {
+  call void @llvm.amdgcn.kill(i1 false)
+  ret void
+}
+
+; SI-LABEL: {{^}}and:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_or_b64 s[0:1]
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = or i1 %c1, %c2
+  call void @llvm.amdgcn.kill(i1 %x)
+  ret void
+}
+
+; SI-LABEL: {{^}}andn2:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_xor_b64 s[0:1]
+; SI: s_andn2_b64 exec, exec, s[0:1]
+define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = xor i1 %c1, %c2
+  %y = xor i1 %x, 1
+  call void @llvm.amdgcn.kill(i1 %y)
+  ret void
+}
+
+; SI-LABEL: {{^}}oeq:
+; SI: v_cmpx_eq_f32
+; SI-NOT: s_and
+define amdgpu_gs void @oeq(float %a) {
+  %c1 = fcmp oeq float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ogt:
+; SI: v_cmpx_lt_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ogt(float %a) {
+  %c1 = fcmp ogt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}oge:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @oge(float %a) {
+  %c1 = fcmp oge float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}olt:
+; SI: v_cmpx_gt_f32
+; SI-NOT: s_and
+define amdgpu_gs void @olt(float %a) {
+  %c1 = fcmp olt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ole:
+; SI: v_cmpx_ge_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ole(float %a) {
+  %c1 = fcmp ole float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}one:
+; SI: v_cmpx_lg_f32
+; SI-NOT: s_and
+define amdgpu_gs void @one(float %a) {
+  %c1 = fcmp one float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ord:
+; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
+; SI: v_cmp_o_f32
+define amdgpu_gs void @ord(float %a) {
+  %c1 = fcmp ord float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}uno:
+; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
+; SI: v_cmp_u_f32
+define amdgpu_gs void @uno(float %a) {
+  %c1 = fcmp uno float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ueq:
+; SI: v_cmpx_nlg_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ueq(float %a) {
+  %c1 = fcmp ueq float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ugt:
+; SI: v_cmpx_nge_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ugt(float %a) {
+  %c1 = fcmp ugt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}uge:
+; SI: v_cmpx_ngt_f32_e32 vcc, -1.0
+; SI-NOT: s_and
+define amdgpu_gs void @uge(float %a) {
+  %c1 = fcmp uge float %a, -1.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ult:
+; SI: v_cmpx_nle_f32_e32 vcc, -2.0
+; SI-NOT: s_and
+define amdgpu_gs void @ult(float %a) {
+  %c1 = fcmp ult float %a, -2.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ule:
+; SI: v_cmpx_nlt_f32_e32 vcc, 2.0
+; SI-NOT: s_and
+define amdgpu_gs void @ule(float %a) {
+  %c1 = fcmp ule float %a, 2.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}une:
+; SI: v_cmpx_neq_f32_e32 vcc, 0
+; SI-NOT: s_and
+define amdgpu_gs void @une(float %a) {
+  %c1 = fcmp une float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}neg_olt:
+; SI: v_cmpx_ngt_f32_e32 vcc, 1.0
+; SI-NOT: s_and
+define amdgpu_gs void @neg_olt(float %a) {
+  %c1 = fcmp olt float %a, 1.0
+  %c2 = xor i1 %c1, 1
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}fcmp_x2:
+; FIXME: LLVM should be able to combine these fcmp opcodes.
+; SI: v_cmp_gt_f32
+; SI: v_cndmask_b32
+; SI: v_cmpx_le_f32
+define amdgpu_ps void @fcmp_x2(float %a) #0 {
+  %ogt = fcmp nsz ogt float %a, 2.500000e-01
+  %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00
+  %c = fcmp nsz oge float %k, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %c) #1
+  ret void
+}
+
+; SI-LABEL: {{^}}wqm:
+; SI: v_cmp_neq_f32_e32 vcc, 0
+; SI: s_wqm_b64 s[0:1], vcc
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_ps void @wqm(float %a) {
+  %c1 = fcmp une float %a, 0.0
+  %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i1 @llvm.amdgcn.wqm.vote(i1)
+
+attributes #0 = { nounwind }
Index: llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -1570,4 +1570,19 @@
   ret float %r
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.kill
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.kill(i1)
+
+; CHECK-LABEL: @kill_true() {
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @kill_true() {
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+
 ; CHECK: attributes #5 = { convergent }
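A second sketch (again hypothetical, not from the patch) shows the other lowering path: when the condition is not a plain float compare against an inline immediate, the intrinsic selects SI_KILL_I1_PSEUDO and SIInsertSkips is expected to lower it to an s_and_b64 of EXEC with the compare mask, much like the "and" test above:

; Hypothetical range kill: keep only lanes whose %x lies in [%lo, %hi].
; The i1 condition is built from integer compares, so this should take
; the SI_KILL_I1_PSEUDO path and end up as
;   s_and_b64 exec, exec, <mask>
; instead of a v_cmpx instruction.
define amdgpu_ps void @kill_outside_range(i32 %x, i32 %lo, i32 %hi) {
  %ge = icmp sge i32 %x, %lo
  %le = icmp sle i32 %x, %hi
  %keep = and i1 %ge, %le
  call void @llvm.amdgcn.kill(i1 %keep)
  ret void
}

declare void @llvm.amdgcn.kill(i1)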