Index: llvm/trunk/include/llvm/CodeGen/MachineInstr.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineInstr.h
+++ llvm/trunk/include/llvm/CodeGen/MachineInstr.h
@@ -917,6 +917,10 @@
     return findRegisterDefOperandIdx(Reg, true, false, TRI) != -1;
   }
 
+  /// Returns true if the MachineInstr has an implicit-use operand of exactly
+  /// the given register (not considering sub/super-registers).
+  bool hasRegisterImplicitUseOperand(unsigned Reg) const;
+
   /// Returns the operand index that is a use of the specific register or -1
   /// if it is not found. It further tightens the search criteria to a use
   /// that kills the register if isKill is true.
Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -328,6 +328,13 @@
   [IntrNoMem]>;  // See int_amdgcn_v_interp_p1 for why this is
                  // IntrNoMem.
 
+// Pixel shaders only: whether the current pixel is live (i.e. not a helper
+// invocation for derivative computation).
+def int_amdgcn_ps_live : Intrinsic <
+  [llvm_i1_ty],
+  [],
+  [IntrNoMem]>;
+
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
Index: llvm/trunk/lib/CodeGen/MachineInstr.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineInstr.cpp
+++ llvm/trunk/lib/CodeGen/MachineInstr.cpp
@@ -1271,6 +1271,17 @@
   return Size;
 }
 
+/// Returns true if the MachineInstr has an implicit-use operand of exactly
+/// the given register (not considering sub/super-registers).
+bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
+  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = getOperand(i);
+    if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
+      return true;
+  }
+  return false;
+}
+
 /// findRegisterUseOperandIdx() - Returns the MachineOperand that is a use of
 /// the specific register or -1 if it is not found. It further tightens
 /// the search criteria to a use that kills the register if isKill is true.
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -250,6 +250,7 @@
   case Intrinsic::amdgcn_buffer_atomic_or:
   case Intrinsic::amdgcn_buffer_atomic_xor:
   case Intrinsic::amdgcn_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_ps_live:
     return true;
   }
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1597,8 +1597,7 @@
   // Make sure we aren't losing exec uses in the td files. This mostly requires
   // being careful when using let Uses to try to add other use registers.
   if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
-    const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
-    if (!Exec || !Exec->isImplicit()) {
+    if (!MI->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
      ErrInfo = "VALU instruction does not implicitly read exec mask";
      return false;
    }
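
To illustrate the new helper in isolation (not part of the patch): a verifier-style scan that applies the same check as the SIInstrInfo.cpp hunk above to every instruction in a block. The function name allVALUReadExec is hypothetical and assumes the usual LLVM CodeGen/AMDGPU headers; MachineInstr::hasRegisterImplicitUseOperand and SIInstrInfo::isVALU are the in-tree calls.

// Hypothetical sketch, mirroring the verifier check above; not in-tree.
static bool allVALUReadExec(const MachineBasicBlock &MBB,
                            const SIInstrInfo &TII) {
  for (const MachineInstr &MI : MBB) {
    // Exact-register match only: sub/super-registers of EXEC do not count.
    if (TII.isVALU(MI.getOpcode()) &&
        !MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC))
      return false; // would trigger "does not implicitly read exec mask"
  }
  return true;
}
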
Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
@@ -2003,6 +2003,14 @@
 
 } // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
 
+let SALU = 1 in
+def SI_PS_LIVE : InstSI <
+  (outs SReg_64:$dst),
+  (ins),
+  "si_ps_live $dst",
+  [(set i1:$dst, (int_amdgcn_ps_live))]
+>;
+
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
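
For context on how the intrinsic reaches the selector: a frontend emits a plain call to llvm.amdgcn.ps.live, which the pattern above selects into the SI_PS_LIVE pseudo. A minimal IRBuilder sketch follows; emitPsLive is an illustrative name, not an in-tree API, while the declaration/call helpers are standard LLVM IR construction calls.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Illustrative only: materialize a call to llvm.amdgcn.ps.live at the
// builder's current insertion point. The result is an i1 that is true
// for real pixels and false for helper invocations.
static llvm::Value *emitPsLive(llvm::Module &M, llvm::IRBuilder<> &Builder) {
  llvm::Function *PsLive =
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::amdgcn_ps_live);
  return Builder.CreateCall(PsLive);
}
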
Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -101,11 +101,12 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<const MachineInstr *, 2> ExecExports;
+  SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
-  char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
+  char scanInstructions(MachineFunction &MF, std::vector<WorkItem>& Worklist);
   void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
   void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
-  char analyzeFunction(const MachineFunction &MF);
+  char analyzeFunction(MachineFunction &MF);
 
   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                unsigned SaveWQM, unsigned LiveMaskReg);
@@ -113,6 +114,8 @@
              unsigned SavedWQM);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
+  void lowerLiveMaskQueries(unsigned LiveMaskReg);
+
 public:
   static char ID;
@@ -148,15 +151,15 @@
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
-char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
+char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
 
   for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
-    const MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock &MBB = *BI;
 
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
-      const MachineInstr &MI = *II;
+      MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
       char Flags;
@@ -167,8 +170,13 @@
         Flags = StateExact;
       } else {
         // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
-          ExecExports.push_back(&MI);
+        if (Opcode == AMDGPU::EXP) {
+          if (MI.getOperand(4).getImm() != 0)
+            ExecExports.push_back(&MI);
+        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+          LiveMaskQueries.push_back(&MI);
+        }
+
         continue;
       }
@@ -290,7 +298,7 @@
   }
 }
 
-char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
+char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
   std::vector<WorkItem> Worklist;
   char GlobalFlags = scanInstructions(MF, Worklist);
@@ -424,6 +432,16 @@
   }
 }
 
+void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
+  for (MachineInstr *MI : LiveMaskQueries) {
+    DebugLoc DL = MI->getDebugLoc();
+    unsigned Dest = MI->getOperand(0).getReg();
+    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+      .addReg(LiveMaskReg);
+    MI->eraseFromParent();
+  }
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
     return false;
@@ -431,30 +449,43 @@
   Instructions.clear();
   Blocks.clear();
   ExecExports.clear();
+  LiveMaskQueries.clear();
 
   TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   MRI = &MF.getRegInfo();
 
   char GlobalFlags = analyzeFunction(MF);
-  if (!(GlobalFlags & StateWQM))
-    return false;
+  if (!(GlobalFlags & StateWQM)) {
+    lowerLiveMaskQueries(AMDGPU::EXEC);
+    return !LiveMaskQueries.empty();
+  }
 
+  // Store a copy of the original live mask when required
   MachineBasicBlock &Entry = MF.front();
   MachineInstr *EntryMI = Entry.getFirstNonPHI();
+  unsigned LiveMaskReg = 0;
+
+  if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
+    LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+      .addReg(AMDGPU::EXEC);
+  }
 
   if (GlobalFlags == StateWQM) {
     // For a shader that needs only WQM, we can just set it once.
     BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+
+    lowerLiveMaskQueries(LiveMaskReg);
+    // EntryMI may become invalid here
     return true;
   }
 
-  // Handle the general case
-  unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-    .addReg(AMDGPU::EXEC);
+  lowerLiveMaskQueries(LiveMaskReg);
+  EntryMI = nullptr;
 
+  // Handle the general case
   for (const auto &BII : Blocks)
     processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
                  BII.first == &*MF.begin());
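
To make the exec/WQM interaction concrete: s_wqm_b64 enables every lane of a 2x2 quad whenever any lane of that quad is live, which is why the pass must copy EXEC into LiveMaskReg before entering WQM; afterwards EXEC no longer distinguishes real pixels from helper lanes. A standalone model of that semantics, in plain C++ and purely illustrative:

#include <cstdint>

// Illustrative model of s_wqm_b64: for each 2x2 quad (four consecutive
// lanes of the 64-bit exec mask), enable all four lanes if any is live.
static uint64_t wholeQuadMode(uint64_t Exec) {
  uint64_t WQM = 0;
  for (unsigned Quad = 0; Quad < 64; Quad += 4)
    if ((Exec >> Quad) & 0xF)       // any live lane in this quad?
      WQM |= UINT64_C(0xF) << Quad; // then light up the whole quad
  return WQM;
}

// Example: wholeQuadMode(0x1) == 0xF, i.e. one live pixel drags in its
// three helper lanes, while the saved LiveMaskReg copy still reads 0x1.
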
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+;
+; Note: We could generate better code here if we recognized earlier that
+; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
+; the expectation is that the intrinsic will be used in non-trivial shaders,
+; so such an optimization doesn't seem worth the effort.
+define amdgpu_ps float @test1() {
+  %live = call i1 @llvm.amdgcn.ps.live()
+  %live.32 = zext i1 %live to i32
+  %r = bitcast i32 %live.32 to float
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}test2:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK: image_sample v0, [[VAR]],
+define amdgpu_ps float @test2() {
+  %live = call i1 @llvm.amdgcn.ps.live()
+  %live.32 = zext i1 %live to i32
+
+  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %r = extractelement <4 x float> %t, i32 0
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}test3:
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_wqm_b64 exec, exec
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
+; CHECK: ; %dead
+define amdgpu_ps float @test3(i32 %in) {
+entry:
+  %live = call i1 @llvm.amdgcn.ps.live()
+  br i1 %live, label %end, label %dead
+
+dead:
+  %tc.dead = mul i32 %in, 2
+  br label %end
+
+end:
+  %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
+  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+
+  %r = extractelement <4 x float> %t, i32 0
+  ret float %r
+}
+
+declare i1 @llvm.amdgcn.ps.live() #0
+
+declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+
+attributes #0 = { nounwind readnone }