Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -753,6 +753,9 @@
   [llvm_i1_ty], [IntrNoMem, IntrConvergent]
 >;
 
+// If false, set EXEC=0 for the current thread until the end of program.
+def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -167,7 +167,6 @@
   [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
 >;
 
-
 def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
 def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2449,7 +2449,7 @@
   if (SplitPoint == BB->end()) {
     // Don't bother with a new block.
-    MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
     return BB;
   }
 
@@ -2463,7 +2463,7 @@
   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
   BB->addSuccessor(SplitBB);
 
-  MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
+  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
   return SplitBB;
 }
 
@@ -3017,7 +3017,8 @@
   case AMDGPU::SI_INDIRECT_DST_V8:
   case AMDGPU::SI_INDIRECT_DST_V16:
     return emitIndirectDst(MI, *BB, *getSubtarget());
-  case AMDGPU::SI_KILL:
+  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+  case AMDGPU::SI_KILL_I1_PSEUDO:
     return splitKillBlock(MI, BB);
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
Index: llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -200,25 +200,101 @@
 void SIInsertSkips::kill(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
-  const MachineOperand &Op = MI.getOperand(0);
-#ifndef NDEBUG
-  CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
-  // Kill is only allowed in pixel / geometry shaders.
-  assert(CallConv == CallingConv::AMDGPU_PS ||
-         CallConv == CallingConv::AMDGPU_GS);
-#endif
-  // Clear this thread from the exec mask if the operand is negative.
-  if (Op.isImm()) {
-    // Constant operand: Set exec mask to 0 or do nothing
-    if (Op.getImm() & 0x80000000) {
-      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
-        .addImm(0);
+  switch (MI.getOpcode()) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
+    unsigned Opcode = 0;
+
+    // The opcodes are inverted because the inline immediate has to be
+    // the first operand, e.g. from "x < imm" to "imm > x"
+    switch (MI.getOperand(2).getImm()) {
+    case ISD::SETOEQ:
+    case ISD::SETEQ:
+      Opcode = AMDGPU::V_CMPX_EQ_F32_e32;
+      break;
+    case ISD::SETOGT:
+    case ISD::SETGT:
+      Opcode = AMDGPU::V_CMPX_LT_F32_e32;
+      break;
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      Opcode = AMDGPU::V_CMPX_LE_F32_e32;
+      break;
+    case ISD::SETOLT:
+    case ISD::SETLT:
+      Opcode = AMDGPU::V_CMPX_GT_F32_e32;
+      break;
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      Opcode = AMDGPU::V_CMPX_GE_F32_e32;
+      break;
+    case ISD::SETONE:
+    case ISD::SETNE:
+      Opcode = AMDGPU::V_CMPX_LG_F32_e32;
+      break;
+    case ISD::SETO:
+      Opcode = AMDGPU::V_CMPX_O_F32_e32;
+      break;
+    case ISD::SETUO:
+      Opcode = AMDGPU::V_CMPX_U_F32_e32;
+      break;
+    case ISD::SETUEQ:
+      Opcode = AMDGPU::V_CMPX_NLG_F32_e32;
+      break;
+    case ISD::SETUGT:
+      Opcode = AMDGPU::V_CMPX_NGE_F32_e32;
+      break;
+    case ISD::SETUGE:
+      Opcode = AMDGPU::V_CMPX_NGT_F32_e32;
+      break;
+    case ISD::SETULT:
+      Opcode = AMDGPU::V_CMPX_NLE_F32_e32;
+      break;
+    case ISD::SETULE:
+      Opcode = AMDGPU::V_CMPX_NLT_F32_e32;
+      break;
+    case ISD::SETUNE:
+      Opcode = AMDGPU::V_CMPX_NEQ_F32_e32;
+      break;
+    default:
+      llvm_unreachable("invalid ISD:SET cond code");
     }
-  } else {
-    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
-        .addImm(0)
+
+    // TODO: Allow this:
+    if (!MI.getOperand(0).isReg() ||
+        !TRI->isVGPR(MBB.getParent()->getRegInfo(),
+                     MI.getOperand(0).getReg()))
+      llvm_unreachable("SI_KILL operand should be a VGPR");
+
+    BuildMI(MBB, &MI, DL, TII->get(Opcode))
+        .add(MI.getOperand(1))
+        .add(MI.getOperand(0));
+    break;
+  }
+  case AMDGPU::SI_KILL_I1_TERMINATOR: {
+    const MachineOperand &Op = MI.getOperand(0);
+    int64_t KillVal = MI.getOperand(1).getImm();
+    assert(KillVal == 0 || KillVal == -1);
+
+    // Kill all threads if Op0 is an immediate and equal to the Kill value.
+    if (Op.isImm()) {
+      int64_t Imm = Op.getImm();
+      assert(Imm == 0 || Imm == -1);
+
+      if (Imm == KillVal)
+        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+          .addImm(0);
+      break;
+    }
+
+    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
+    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
+        .addReg(AMDGPU::EXEC)
         .add(Op);
+    break;
+  }
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
   }
 }
 
@@ -311,7 +387,8 @@
       }
       break;
 
-    case AMDGPU::SI_KILL_TERMINATOR:
+    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+    case AMDGPU::SI_KILL_I1_TERMINATOR:
       MadeChange = true;
       kill(MI);
 
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
@@ -857,6 +857,9 @@
                             MachineBasicBlock::iterator I,
                             const DebugLoc &DL,
                             unsigned DestReg) const;
+
+  static bool isKillTerminator(unsigned Opcode);
+  const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
 };
 
 namespace AMDGPU {
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4591,3 +4591,24 @@
   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
 }
+
+bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
+  case AMDGPU::SI_KILL_I1_TERMINATOR:
+    return true;
+  default:
+    return false;
+  }
+}
+
+const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
+    return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
+  case AMDGPU::SI_KILL_I1_PSEUDO:
+    return get(AMDGPU::SI_KILL_I1_TERMINATOR);
+  default:
+    llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
+  }
+}
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
@@ -297,6 +297,10 @@
   return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
 }]>;
 
+def cond_as_i32imm: SDNodeXForm<cond, [{
+  return CurDAG->getTargetConstant(N->get(), SDLoc(N), MVT::i32);
+}]>;
+
 // Copied from the AArch64 backend:
 def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
 return CurDAG->getTargetConstant(
Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -275,18 +275,21 @@
 }
 
 let Uses = [EXEC], Defs = [EXEC,VCC] in {
-def SI_KILL : PseudoInstSI <
-  (outs), (ins VSrc_b32:$src),
-  [(AMDGPUkill i32:$src)]> {
-  let isConvergent = 1;
-  let usesCustomInserter = 1;
-}
 
-def SI_KILL_TERMINATOR : SPseudoInstSI <
-  (outs), (ins VSrc_b32:$src)> {
-  let isTerminator = 1;
+multiclass PseudoInstKill <dag ins> {
+  def _PSEUDO : PseudoInstSI <(outs), ins> {
+    let isConvergent = 1;
+    let usesCustomInserter = 1;
+  }
+
+  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
+    let isTerminator = 1;
+  }
 }
 
+defm SI_KILL_I1 : PseudoInstKill <(ins SSrc_b64:$src, i1imm:$killvalue)>;
+defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+
 def SI_ILLEGAL_COPY : SPseudoInstSI <
   (outs unknown:$dst), (ins unknown:$src),
   [], " ; illegal copy $src to $dst">;
@@ -546,8 +549,35 @@
 def : GCNPat <
   (int_AMDGPU_kilp),
-  (SI_KILL (i32 0xbf800000))
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
+  (AMDGPUkill (i32 -1082130432)),
+  (SI_KILL_I1_PSEUDO (i1 0), 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill i1:$src),
+  (SI_KILL_I1_PSEUDO $src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (not i1:$src))),
+  (SI_KILL_I1_PSEUDO $src, -1)
+>;
+
+def : Pat <
+  (AMDGPUkill i32:$src),
+  (SI_KILL_F32_COND_IMM_PSEUDO $src, 0, 3) // 3 means SETOGE
+>;
+
+def : Pat <
+  (int_amdgcn_kill (i1 (setcc f32:$src, InlineFPImm:$imm, cond:$cond))),
+  (SI_KILL_F32_COND_IMM_PSEUDO $src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
+// TODO: we could add more variants for other types of conditionals
 
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
Index: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -134,7 +134,8 @@
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
+                       const SIInstrInfo *TII) {
   unsigned SaveExecReg = MI.getOperand(0).getReg();
   auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
 
@@ -143,7 +144,7 @@
       U->getOpcode() != AMDGPU::SI_END_CF)
     return false;
 
-  // Check for SI_KILL_TERMINATOR on path from if to endif.
+  // Check for SI_KILL_*_TERMINATOR on path from if to endif.
   // if there is any such terminator simplififcations are not safe.
   auto SMBB = MI.getParent();
   auto EMBB = U->getParent();
@@ -157,7 +158,7 @@
     if (MBB == EMBB || !Visited.insert(MBB).second)
       continue;
     for(auto &Term : MBB->terminators())
-      if (Term.getOpcode() == AMDGPU::SI_KILL_TERMINATOR)
+      if (TII->isKillTerminator(Term.getOpcode()))
        return false;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
@@ -184,7 +185,7 @@
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
-  bool SimpleIf = isSimpleIf(MI, MRI);
+  bool SimpleIf = isSimpleIf(MI, MRI, TII);
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3539,6 +3539,14 @@
     return replaceInstUsesWith(*II, II->getArgOperand(0));
   }
+  case Intrinsic::amdgcn_kill: {
+    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
+    if (!C || !C->getZExtValue())
+      break;
+
+    // amdgcn.kill(i1 1) is a no-op
+    return eraseInstFromFunction(CI);
+  }
   case Intrinsic::stackrestore: {
     // If the save is right next to the restore, remove the restore. This can
     // happen when variable allocas are DCE'd.
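As a usage note (not part of the patch): the V_CMPX fast path above only fires when the kill condition is a single f32 compare against an inline immediate; everything else goes through the i1 pseudo. A minimal sketch of the fast-path case, with a made-up function name and threshold, is the classic alpha-test discard:

; Hypothetical alpha-test kill. 0.5 is an inline immediate, so the
; (setcc f32:$src, InlineFPImm:$imm, cond:$cond) pattern above should
; select this to SI_KILL_F32_COND_IMM_PSEUDO and finally to a single
; v_cmpx_le_f32 (oge is inverted because the immediate comes first).
define amdgpu_ps void @alpha_test_kill(float %alpha) {
  %keep = fcmp oge float %alpha, 0.5
  call void @llvm.amdgcn.kill(i1 %keep)
  ret void
}

declare void @llvm.amdgcn.kill(i1)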
Index: llvm/trunk/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/insert-skips-kill-uncond.mir
@@ -33,7 +33,7 @@
   bb.1:
     successors: %bb.2
     %vgpr0 = V_MOV_B32_e32 0, implicit %exec
-    SI_KILL_TERMINATOR %vgpr0, implicit-def %exec, implicit-def %vcc, implicit %exec
+    SI_KILL_F32_COND_IMM_TERMINATOR %vgpr0, 0, 3, implicit-def %exec, implicit-def %vcc, implicit %exec
     S_BRANCH %bb.2
 
   bb.2:
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll
@@ -0,0 +1,241 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
+
+; SI-LABEL: {{^}}gs_const:
+; SI-NOT: v_cmpx
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @gs_const() {
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  %c1 = fcmp oge float %tmp1, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  %c2 = fcmp oge float %tmp3, 0.0
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}vcc_implicit_def:
+; SI-NOT: v_cmp_gt_f32_e32 vcc,
+; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
+; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
+; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
+define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) {
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  %c1 = fcmp oge float %arg14, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+; SI-LABEL: {{^}}true:
+; SI-NEXT: BB#
+; SI-NEXT: BB#
+; SI-NEXT: s_endpgm
+define amdgpu_gs void @true() {
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+; SI-LABEL: {{^}}false:
+; SI-NOT: v_cmpx
+; SI: s_mov_b64 exec, 0
+define amdgpu_gs void @false() {
+  call void @llvm.amdgcn.kill(i1 false)
+  ret void
+}
+
+; SI-LABEL: {{^}}and:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_or_b64 s[0:1]
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_gs void @and(i32 %a, i32 %b, i32 %c, i32 %d) {
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = or i1 %c1, %c2
+  call void @llvm.amdgcn.kill(i1 %x)
+  ret void
+}
+
+; SI-LABEL: {{^}}andn2:
+; SI: v_cmp_lt_i32
+; SI: v_cmp_lt_i32
+; SI: s_xor_b64 s[0:1]
+; SI: s_andn2_b64 exec, exec, s[0:1]
+define amdgpu_gs void @andn2(i32 %a, i32 %b, i32 %c, i32 %d) {
+  %c1 = icmp slt i32 %a, %b
+  %c2 = icmp slt i32 %c, %d
+  %x = xor i1 %c1, %c2
+  %y = xor i1 %x, 1
+  call void @llvm.amdgcn.kill(i1 %y)
+  ret void
+}
+
+; SI-LABEL: {{^}}oeq:
+; SI: v_cmpx_eq_f32
+; SI-NOT: s_and
+define amdgpu_gs void @oeq(float %a) {
+  %c1 = fcmp oeq float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ogt:
+; SI: v_cmpx_lt_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ogt(float %a) {
+  %c1 = fcmp ogt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}oge:
+; SI: v_cmpx_le_f32
+; SI-NOT: s_and
+define amdgpu_gs void @oge(float %a) {
+  %c1 = fcmp oge float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}olt:
+; SI: v_cmpx_gt_f32
+; SI-NOT: s_and
+define amdgpu_gs void @olt(float %a) {
+  %c1 = fcmp olt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ole:
+; SI: v_cmpx_ge_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ole(float %a) {
+  %c1 = fcmp ole float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}one:
+; SI: v_cmpx_lg_f32
+; SI-NOT: s_and
+define amdgpu_gs void @one(float %a) {
+  %c1 = fcmp one float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ord:
+; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
+; SI: v_cmp_o_f32
+define amdgpu_gs void @ord(float %a) {
+  %c1 = fcmp ord float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}uno:
+; FIXME: This is absolutely unimportant, but we could use the cmpx variant here.
+; SI: v_cmp_u_f32
+define amdgpu_gs void @uno(float %a) {
+  %c1 = fcmp uno float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ueq:
+; SI: v_cmpx_nlg_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ueq(float %a) {
+  %c1 = fcmp ueq float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ugt:
+; SI: v_cmpx_nge_f32
+; SI-NOT: s_and
+define amdgpu_gs void @ugt(float %a) {
+  %c1 = fcmp ugt float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}uge:
+; SI: v_cmpx_ngt_f32_e32 vcc, -1.0
+; SI-NOT: s_and
+define amdgpu_gs void @uge(float %a) {
+  %c1 = fcmp uge float %a, -1.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ult:
+; SI: v_cmpx_nle_f32_e32 vcc, -2.0
+; SI-NOT: s_and
+define amdgpu_gs void @ult(float %a) {
+  %c1 = fcmp ult float %a, -2.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}ule:
+; SI: v_cmpx_nlt_f32_e32 vcc, 2.0
+; SI-NOT: s_and
+define amdgpu_gs void @ule(float %a) {
+  %c1 = fcmp ule float %a, 2.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}une:
+; SI: v_cmpx_neq_f32_e32 vcc, 0
+; SI-NOT: s_and
+define amdgpu_gs void @une(float %a) {
+  %c1 = fcmp une float %a, 0.0
+  call void @llvm.amdgcn.kill(i1 %c1)
+  ret void
+}
+
+; SI-LABEL: {{^}}neg_olt:
+; SI: v_cmpx_ngt_f32_e32 vcc, 1.0
+; SI-NOT: s_and
+define amdgpu_gs void @neg_olt(float %a) {
+  %c1 = fcmp olt float %a, 1.0
+  %c2 = xor i1 %c1, 1
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+; SI-LABEL: {{^}}fcmp_x2:
+; FIXME: LLVM should be able to combine these fcmp opcodes.
+; SI: v_cmp_gt_f32
+; SI: v_cndmask_b32
+; SI: v_cmpx_le_f32
+define amdgpu_ps void @fcmp_x2(float %a) #0 {
+  %ogt = fcmp nsz ogt float %a, 2.500000e-01
+  %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00
+  %c = fcmp nsz oge float %k, 0.000000e+00
+  call void @llvm.amdgcn.kill(i1 %c) #1
+  ret void
+}
+
+; SI-LABEL: {{^}}wqm:
+; SI: v_cmp_neq_f32_e32 vcc, 0
+; SI: s_wqm_b64 s[0:1], vcc
+; SI: s_and_b64 exec, exec, s[0:1]
+define amdgpu_ps void @wqm(float %a) {
+  %c1 = fcmp une float %a, 0.0
+  %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1)
+  call void @llvm.amdgcn.kill(i1 %c2)
+  ret void
+}
+
+declare void @llvm.amdgcn.kill(i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i1 @llvm.amdgcn.wqm.vote(i1)
+
+attributes #0 = { nounwind }
Index: llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ llvm/trunk/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -1570,4 +1570,19 @@
   ret float %r
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.kill
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.kill(i1)
+
+; CHECK-LABEL: @kill_true() {
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+define void @kill_true() {
+  call void @llvm.amdgcn.kill(i1 true)
+  ret void
+}
+
+
 ; CHECK: attributes #5 = { convergent }
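A second sketch (again hypothetical, not from the patch) shows the other lowering path: when the condition is not a plain float compare against an inline immediate, the intrinsic selects SI_KILL_I1_PSEUDO and SIInsertSkips is expected to lower it to an s_and_b64 of EXEC with the compare mask, much like the "and" test above:

; Hypothetical range kill: keep only lanes whose %x lies in [%lo, %hi].
; The i1 condition is built from integer compares, so this should take
; the SI_KILL_I1_PSEUDO path and end up as
;   s_and_b64 exec, exec, <mask>
; instead of a v_cmpx instruction.
define amdgpu_ps void @kill_outside_range(i32 %x, i32 %lo, i32 %hi) {
  %ge = icmp sge i32 %x, %lo
  %le = icmp sle i32 %x, %hi
  %keep = and i1 %ge, %le
  call void @llvm.amdgcn.kill(i1 %keep)
  ret void
}

declare void @llvm.amdgcn.kill(i1)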