Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -753,6 +753,9 @@
   [llvm_i1_ty], [IntrNoMem, IntrSpeculatable, IntrConvergent]
 >;
 
+// If false, set EXEC=0 for the current thread until the end of program.
+def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -167,6 +167,12 @@
   [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
 >;
 
+def COND_EQ_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOEQ;}]>;
+def COND_NE_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETONE;}]>;
+def COND_GT_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOGT;}]>;
+def COND_GE_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOGE;}]>;
+def COND_LT_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOLT;}]>;
+def COND_LE_NONANS : PatLeaf <(cond), [{return (N->get() & 0x7) == ISD::SETOLE;}]>;
 def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
 def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2431,7 +2431,7 @@
 
   if (SplitPoint == BB->end()) {
     // Don't bother with a new block.
-    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
     return BB;
   }
@@ -2445,7 +2445,7 @@
   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
   BB->addSuccessor(SplitBB);
 
-  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
   return SplitBB;
 }
@@ -2999,7 +2999,8 @@
   case AMDGPU::SI_INDIRECT_DST_V8:
   case AMDGPU::SI_INDIRECT_DST_V16:
     return emitIndirectDst(MI, *BB, *getSubtarget());
-  case AMDGPU::SI_KILL:
+  case AMDGPU::SI_KILL_F32_GE_0_PSEUDO:
+  case AMDGPU::SI_KILL_I1_PSEUDO:
     return splitKillBlock(MI, BB);
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Index: lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertSkips.cpp
+++ lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -198,17 +198,36 @@
   assert(CallConv == CallingConv::AMDGPU_PS ||
          CallConv == CallingConv::AMDGPU_GS);
 #endif
-  // Clear this thread from the exec mask if the operand is negative.
-  if (Op.isImm()) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
-    }
-  } else {
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_KILL_F32_GE_0_TERMINATOR:
     BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
       .addImm(0)
       .add(Op);
+    break;
+  case AMDGPU::SI_KILL_I1_TERMINATOR: {
+    int64_t KillVal = MI.getOperand(1).getImm();
+    assert(KillVal == 0 || KillVal == -1);
+
+    // Kill all threads if Op0 is an immediate and equal to the Kill value.
+    if (Op.isImm()) {
+      int64_t Imm = Op.getImm();
+      assert(Imm == 0 || Imm == -1);
+
+      if (Imm == KillVal)
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+          .addImm(0);
+      break;
+    }
+
+    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
+    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC)
+      .add(Op);
+    break;
+  }
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
   }
 }
@@ -301,7 +320,8 @@
       }
       break;
 
-    case AMDGPU::SI_KILL_TERMINATOR:
+    case AMDGPU::SI_KILL_F32_GE_0_TERMINATOR:
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
       MadeChange = true;
       kill(MI);
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -856,6 +856,9 @@
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL,
                               unsigned DestReg) const;
+
+  static bool isKillTerminator(unsigned Opcode);
+  const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
 };
 
 namespace AMDGPU {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4569,3 +4569,24 @@
   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
 }
+
+bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_GE_0_TERMINATOR:
+  case AMDGPU::SI_KILL_I1_TERMINATOR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_GE_0_PSEUDO:
+    return get(AMDGPU::SI_KILL_F32_GE_0_TERMINATOR);
+  case AMDGPU::SI_KILL_I1_PSEUDO:
+    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
+  }
+}
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -277,18 +277,21 @@
 }
 
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : PseudoInstSI <
-  (outs), (ins VSrc_b32:$src),
-  [(AMDGPUkill i32:$src)]> {
-  let isConvergent = 1;
-  let usesCustomInserter = 1;
-}
-
-def SI_KILL_TERMINATOR : SPseudoInstSI <
-  (outs), (ins VSrc_b32:$src)> {
-  let isTerminator = 1;
+multiclass PseudoInstKill <dag ins> {
+  def _PSEUDO : PseudoInstSI <(outs), ins> {
+    let isConvergent = 1;
+    let usesCustomInserter = 1;
+  }
+
+  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
+    let isTerminator = 1;
+  }
 }
 
+defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_F32_GE_0 : PseudoInstKill <(ins VSrc_b32:$src0)>;
+
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
   [], " ; illegal copy $src to $dst">;
@@ -551,8 +554,35 @@
 
 def : Pat <
   (int_AMDGPU_kilp),
-  (SI_KILL (i32 0xbf800000))
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
 >;
+
+def : Pat <
+  // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
+  (AMDGPUkill (i32 -1082130432)),
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill i1:$src),
+  (SI_KILL_I1_PSEUDO $src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (not i1:$src))),
+  (SI_KILL_I1_PSEUDO $src, -1)
+>;
+
+def : Pat <
+  (AMDGPUkill i32:$src),
+  (SI_KILL_F32_GE_0_PSEUDO $src)
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (setcc f32:$src, FP_ZERO, COND_GE_NONANS))),
+  (SI_KILL_F32_GE_0_PSEUDO $src)
+>;
+
+// TODO: we could add more variants for other types of conditionals
 
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -134,7 +134,8 @@
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
+                       const SIInstrInfo *TII) {
   unsigned SaveExecReg = MI.getOperand(0).getReg();
   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
 
@@ -143,7 +144,7 @@
       U->getOpcode() != AMDGPU::SI_END_CF)
     return false;
 
-  // Check for SI_KILL_TERMINATOR on path from if to endif.
+  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
   // if there is any such terminator simplifications are not safe.
   auto SMBB = MI.getParent();
   auto EMBB = U->getParent();
@@ -157,7 +158,7 @@
     if (MBB == EMBB || !Visited.insert(MBB).second)
       continue;
     for(auto &Term : MBB->terminators())
-      if (Term.getOpcode() == AMDGPU::SI_KILL_TERMINATOR)
+      if (TII->isKillTerminator(Term.getOpcode()))
        return false;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
@@ -184,7 +185,7 @@
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI);
+  bool SimpleIf = isSimpleIf(MI, MRI, TII);
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3540,6 +3540,14 @@
     // wqm_vote is identity when the argument is constant.
     return replaceInstUsesWith(*II, II->getArgOperand(0));
   }
+  case Intrinsic::amdgcn_kill: {
+    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
+    if (!C || !C->getZExtValue())
+      break;
+
+    // amdgcn.kill(i1 1) is a no-op
+    return eraseInstFromFunction(CI);
+  }
   case Intrinsic::stackrestore: {
     // If the save is right next to the restore, remove the restore. This can
     // happen when variable allocas are DCE'd.
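For reference, a minimal IR-level sketch of the fold the InstCombineCalls.cpp hunk above adds (the function name @kill_sketch is illustrative, not part of the patch): a kill whose argument folds to the constant true is erased as a no-op, while a false or non-constant argument is left for the backend, where the SIInstructions.td patterns select SI_KILL_I1_PSEUDO and SIInsertSkips lowers it to an exec-mask update.

  ; Input to -instcombine:
  define amdgpu_ps void @kill_sketch(i1 %cond) {
    call void @llvm.amdgcn.kill(i1 true)  ; no-op, erased by instcombine
    call void @llvm.amdgcn.kill(i1 %cond) ; kept; lowers to s_and_b64 exec, exec, <cond>
    ret void
  }
  declare void @llvm.amdgcn.kill(i1)
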
Index: test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
===================================================================
--- test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
+++ test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -33,7 +33,7 @@
   bb.1:
     successors: %bb.2
     %vgpr0 = V_MOV_B32_e32 0, implicit %exec
-    SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
+    SI_KILL_F32_GE_0_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
     S_BRANCH %bb.2
 
   bb.2:
Index: test/CodeGen/AMDGPU/kill.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/kill.ll
@@ -0,0 +1,159 @@
+; RUN: opt -S -mtriple=amdgcn-- -instcombine < %s | llc -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: opt -S -mtriple=amdgcn-- -instcombine < %s | llc -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}kill_gs_const:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @kill_gs_const() {
+main_body:
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp3)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
+entry:
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  call void @llvm.AMDGPU.kill(float %arg14)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; SI-LABEL: {{^}}new_kill_gs_const:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @new_kill_gs_const() {
+main_body:
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  %c1 = fcmp oge float %tmp1, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  %c2 = fcmp oge float %tmp3, 0.0
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}new_kill_vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define amdgpu_ps void @new_kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
+entry:
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  %c1 = fcmp oge float %arg14, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_true:
+; SI-NOT: v_cmpx
+; SI-NOT: s_mov
+define amdgpu_gs void @kill_true() {
+main_body:
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_false:
+; SI-NOT: v_cmpx_le_f32
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @kill_false() {
+main_body:
+  call void @llvm.amdgcn.kill(i1 false)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_and:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_or_b64 s[0:1]
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_gs void @kill_and(i32 %a, i32 %b, i32 %c, i32 %d) {
+main_body:
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = or i1 %c1, %c2
+  call void @llvm.amdgcn.kill(i1 %x)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_andn2:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_xor_b64 s[0:1]
+; SI: s_andn2_b64 exec, exec, s[0:1]
+define amdgpu_gs void @kill_andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
+main_body:
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = xor i1 %c1, %c2
+  %y = xor i1 %x, 1
+  call void @llvm.amdgcn.kill(i1 %y)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_oge:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @kill_oge(float %a) {
+main_body:
+  %c1 = fcmp oge float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_neg_olt:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @kill_neg_olt(float %a) {
+main_body:
+  %c1 = fcmp olt float %a, 0.0
+  %c2 = xor i1 %c1, 1
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}kill_uge:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @kill_uge(float %a) {
+main_body:
+  %c1 = fcmp uge float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}fcmp_x2:
+; FIXME: LLVM should be able to combine these fcmp opcodes.
+; SI: v_cmp_gt_f32
+; SI: v_cndmask_b32
+; SI: v_cmpx_le_f32
+define amdgpu_ps void @fcmp_x2(float %a) #0 {
+main_body:
+  %ogt = fcmp nsz ogt float %a, 2.500000e-01
+  %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00
+  %c = fcmp nsz oge float %k, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %c) #1
+  ret void
+}
+
+declare void @llvm.AMDGPU.kill(float) #0
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-
-; SI-LABEL: {{^}}kill_gs_const:
-; SI-NOT: v_cmpx_le_f32
-; SI: s_mov_b64 exec, 0
-define amdgpu_gs void @kill_gs_const() {
-main_body:
-  %tmp = icmp ule i32 0, 3
-  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %tmp1)
-  %tmp2 = icmp ule i32 3, 0
-  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %tmp3)
-  ret void
-}
-
-; SI-LABEL: {{^}}kill_vcc_implicit_def:
-; SI-NOT: v_cmp_gt_f32_e32 vcc,
-; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
-; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
-entry:
-  %tmp0 = fcmp olt float %arg13, 0.000000e+00
-  call void @llvm.AMDGPU.kill(float %arg14)
-  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
-  ret void
-}
-
-declare void @llvm.AMDGPU.kill(float) #0
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-
-attributes #0 = { nounwind }
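As a usage sketch (not part of the patch; function names are illustrative): a frontend migrating from the legacy float kill, which kills a lane when its operand is negative, can pass the inverted sign test directly as an i1, and the COND_GE_NONANS pattern above still selects a single v_cmpx_le_f32 for that common shape, as the kill_oge test checks.

  ; Legacy form: kills the lane when %x < 0.
  define amdgpu_ps void @kill_if_negative_old(float %x) {
    call void @llvm.AMDGPU.kill(float %x)
    ret void
  }

  ; New form: the lane is killed when the i1 operand is false, so any
  ; predicate works; this one matches the COND_GE_NONANS pattern and
  ; selects v_cmpx_le_f32 0, %x.
  define amdgpu_ps void @kill_if_negative_new(float %x) {
    %keep = fcmp oge float %x, 0.0
    call void @llvm.amdgcn.kill(i1 %keep)
    ret void
  }

  declare void @llvm.AMDGPU.kill(float)
  declare void @llvm.amdgcn.kill(i1)

Making the condition an explicit i1 also lets the backend lower general predicates straight to exec-mask arithmetic (s_and_b64 / s_andn2_b64, per the SIInsertSkips hunk) instead of forcing every kill through a float compare.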