diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1349,13 +1349,18 @@
             [IntrNoMem, IntrSpeculatable, IntrWillReturn,
              ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
 
-// Pixel shaders only: whether the current pixel is live (i.e. not a helper
-// invocation for derivative computation).
+// Deprecated: use llvm.amdgcn.live.mask instead.
 def int_amdgcn_ps_live : Intrinsic <
   [llvm_i1_ty],
   [],
   [IntrNoMem, IntrWillReturn]>;
 
+// Query whether the current lane is live.
+// Returns true if the lane is live (i.e. not a helper lane).
+def int_amdgcn_live_mask : Intrinsic <[llvm_i1_ty],
+  [], [IntrReadMem, IntrInaccessibleMemOnly]
+>;
+
 def int_amdgcn_mbcnt_lo :
   GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
@@ -1585,6 +1590,11 @@
   Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects]
 >;
 
+// If false, mark all active lanes as helper lanes until the end of program.
+def int_amdgcn_wqm_demote : Intrinsic<[],
+  [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly]
+>;
+
 // Copies the active channels of the source value to the destination value,
 // with the guarantee that the source value is computed as if the entire
 // program were executed in Whole Wavefront Mode, i.e. with all channels
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4194,6 +4194,11 @@
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
       break;
     }
+    case Intrinsic::amdgcn_live_mask: {
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+      break;
+    }
+    case Intrinsic::amdgcn_wqm_demote:
     case Intrinsic::amdgcn_kill: {
       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
       break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -243,6 +243,7 @@
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
 def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
 def : SourceOfDivergence<int_amdgcn_ps_live>;
+def : SourceOfDivergence<int_amdgcn_live_mask>;
 def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
 def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -378,6 +378,18 @@
   let SALU = 1;
 }
 
+let Uses = [EXEC] in {
+def SI_LIVE_MASK : PseudoInstSI <
+  (outs SReg_1:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_live_mask))]> {
+  let SALU = 1;
+}
+let Defs = [EXEC,SCC] in {
+// Demote: Turn a pixel shader thread into a helper lane.
+def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
+} // End Defs = [EXEC,SCC]
+} // End Uses = [EXEC]
+
 def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
   [(int_amdgcn_unreachable)],
   "; divergent unreachable"> {
@@ -751,6 +763,16 @@
   (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
 >;
 
+def : Pat <
+  (int_amdgcn_wqm_demote i1:$src),
+  (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
+>;
+
+def : Pat <
+  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
+  (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
+>;
+
   // TODO: we could add more variants for other types of conditionals
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -72,6 +72,7 @@
   MachineRegisterInfo *MRI = nullptr;
   SetVector<MachineInstr*> LoweredEndCf;
   DenseSet<Register> LoweredIf;
+  SmallSet<MachineBasicBlock *, 4> KillBlocks;
 
   const TargetRegisterClass *BoolRC = nullptr;
   unsigned AndOpc;
@@ -84,6 +85,8 @@
   unsigned OrSaveExecOpc;
   unsigned Exec;
 
+  bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
+
   void emitIf(MachineInstr &MI);
   void emitElse(MachineInstr &MI);
   void emitIfBreak(MachineInstr &MI);
@@ -161,8 +164,8 @@
 
 char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
 
-static bool hasKill(const MachineBasicBlock *Begin,
-                    const MachineBasicBlock *End, const SIInstrInfo *TII) {
+bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
+                                 const MachineBasicBlock *End) {
   DenseSet<const MachineBasicBlock*> Visited;
   SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());
 
@@ -171,9 +174,8 @@
 
     if (MBB == End || !Visited.insert(MBB).second)
       continue;
-    for (auto &Term : MBB->terminators())
-      if (TII->isKillTerminator(Term.getOpcode()))
-        return true;
+    if (KillBlocks.contains(MBB))
+      return true;
 
     Worklist.append(MBB->succ_begin(), MBB->succ_end());
   }
@@ -213,7 +215,7 @@
     // Check for SI_KILL_*_TERMINATOR on path from if to endif.
     // if there is any such terminator simplifications are not safe.
     auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
-    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
   }
 
   // Add an implicit def of exec to discourage scheduling VALU after this which
@@ -799,6 +801,28 @@
     Exec = AMDGPU::EXEC;
   }
 
+  // Compute set of blocks with kills
+  const bool CanDemote =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+  for (auto &MBB : MF) {
+    bool IsKillBlock = false;
+    for (auto &Term : MBB.terminators()) {
+      if (TII->isKillTerminator(Term.getOpcode())) {
+        KillBlocks.insert(&MBB);
+        IsKillBlock = true;
+        break;
+      }
+    }
+    if (CanDemote && !IsKillBlock) {
+      for (auto &MI : MBB) {
+        if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+          KillBlocks.insert(&MBB);
+          break;
+        }
+      }
+    }
+  }
+
   MachineFunction::iterator NextBB;
   for (MachineFunction::iterator BI = MF.begin();
        BI != MF.end(); BI = NextBB) {
@@ -848,6 +872,7 @@
 
   LoweredEndCf.clear();
   LoweredIf.clear();
+  KillBlocks.clear();
 
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -457,10 +457,11 @@
         III.Disabled = StateWQM | StateWWM;
         continue;
       } else {
-        if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
           LiveMaskQueries.push_back(&MI);
         } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
-                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR) {
+                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
+                   Opcode == AMDGPU::SI_DEMOTE_I1) {
           KillInstrs.push_back(&MI);
           BBI.NeedsLowering = true;
         } else if (WQMOutputs) {
@@ -799,6 +800,7 @@
   const DebugLoc &DL = MI.getDebugLoc();
   MachineInstr *MaskUpdateMI = nullptr;
 
+  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
   const MachineOperand &Op = MI.getOperand(0);
   int64_t KillVal = MI.getOperand(1).getImm();
   MachineInstr *ComputeKilledMaskMI = nullptr;
@@ -815,10 +817,14 @@
     } else {
       // Static: kill does nothing
       MachineInstr *NewTerm = nullptr;
-      assert(MBB.succ_size() == 1);
-      NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
-                    .addMBB(*MBB.succ_begin());
-      LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+      if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+        LIS->RemoveMachineInstrFromMaps(MI); // Any demote, WQM or exact.
+      } else { // Only kill terminators are replaced by a branch.
+        assert(MBB.succ_size() == 1);
+        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
+                      .addMBB(*MBB.succ_begin());
+        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
+      }
       MBB.remove(&MI);
       return NewTerm;
     }
@@ -848,17 +854,30 @@
   // In the case we got this far some lanes are still live,
   // update EXEC to deactivate lanes as appropriate.
   MachineInstr *NewTerm;
-  if (Op.isImm()) {
-    unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
-    NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
-  } else if (!IsWQM) {
-    NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+  MachineInstr *WQMMaskMI = nullptr;
+  Register LiveMaskWQM;
+  if (IsDemote) {
+    // Demotes deactivate quads with only helper lanes
+    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
+    WQMMaskMI =
+        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
+    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                   .addReg(Exec)
-                  .addReg(LiveMaskReg);
+                  .addReg(LiveMaskWQM);
   } else {
-    unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
-    NewTerm =
-        BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+    // Kills deactivate lanes
+    if (Op.isImm()) {
+      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
+    } else if (!IsWQM) {
+      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
+                    .addReg(Exec)
+                    .addReg(LiveMaskReg);
+    } else {
+      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
+      NewTerm =
+          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
+    }
   }
 
   // Update live intervals
@@ -871,6 +890,8 @@
     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
+  if (WQMMaskMI)
+    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
   LIS->InsertMachineInstrInMaps(*NewTerm);
 
   if (CndReg) {
@@ -879,6 +900,8 @@
   }
   if (TmpReg)
     LIS->createAndComputeVirtRegInterval(TmpReg);
+  if (LiveMaskWQM)
+    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
 
   return NewTerm;
 }
@@ -910,6 +933,7 @@
 
     MachineInstr *SplitPoint = nullptr;
     switch (MI.getOpcode()) {
+    case AMDGPU::SI_DEMOTE_I1:
     case AMDGPU::SI_KILL_I1_TERMINATOR:
       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
       break;
@@ -1319,6 +1343,7 @@
     MachineBasicBlock *MBB = MI->getParent();
     MachineInstr *SplitPoint = nullptr;
     switch (MI->getOpcode()) {
+    case AMDGPU::SI_DEMOTE_I1:
     case AMDGPU::SI_KILL_I1_TERMINATOR:
       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
       break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -0,0 +1,1186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
+
+define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
+; SI-LABEL: static_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 exec, exec, exec
+; SI-NEXT:    s_cbranch_scc0 BB0_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB0_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: static_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX9-NEXT:    s_cbranch_scc0 BB0_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB0_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: static_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB0_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: static_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB0_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
+; SI-LABEL: dynamic_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_cbranch_scc0 BB1_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB1_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: dynamic_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_cbranch_scc0 BB1_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB1_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: dynamic_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
+; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
+; GFX10-32-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB1_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: dynamic_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB1_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %c1)
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @branch(float %arg0, float %arg1) {
+; SI-LABEL: branch:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_xor_b64 s[2:3], vcc, -1
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB2_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB2_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: branch:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_xor_b64 s[2:3], vcc, -1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB2_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB2_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: branch:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB2_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: branch:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], vcc, -1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB2_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %i0 = fptosi float %arg0 to i32
+  %i1 = fptosi float %arg1 to i32
+  %c0 = or i32 %i0, %i1
+  %c1 = and i32 %c0, 1
+  %c2 = icmp eq i32 %c1, 0
+  br i1 %c2, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_1:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB3_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB3_5
+; SI-NEXT:  BB3_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB3_5:
+;
+; GFX9-LABEL: wqm_demote_1:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB3_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB3_5
+; GFX9-NEXT:  BB3_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB3_5:
+;
+; GFX10-32-LABEL: wqm_demote_1:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB3_5
+; GFX10-32-NEXT:  BB3_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB3_5:
+;
+; GFX10-64-LABEL: wqm_demote_1:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB3_5
+; GFX10-64-NEXT:  BB3_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB3_5:
+.entry:
+  %z.cmp = fcmp olt float %z, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_2:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB4_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB4_5
+; SI-NEXT:  BB4_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB4_5:
+;
+; GFX9-LABEL: wqm_demote_2:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB4_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB4_5
+; GFX9-NEXT:  BB4_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB4_5:
+;
+; GFX10-32-LABEL: wqm_demote_2:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB4_5
+; GFX10-32-NEXT:  BB4_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB4_5:
+;
+; GFX10-64-LABEL: wqm_demote_2:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB4_5
+; GFX10-64-NEXT:  BB4_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB4_5:
+.entry:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0
+  %z.cmp = fcmp olt float %tex0, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote
+
+.demote:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_dynamic:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; SI-NEXT:    s_cbranch_scc0 BB5_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB5_3
+; SI-NEXT:  BB5_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB5_3:
+;
+; GFX9-LABEL: wqm_demote_dynamic:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT:    s_cbranch_scc0 BB5_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB5_3
+; GFX9-NEXT:  BB5_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB5_3:
+;
+; GFX10-32-LABEL: wqm_demote_dynamic:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s13, vcc_lo, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
+; GFX10-32-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_wqm_b32 s13, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB5_3
+; GFX10-32-NEXT:  BB5_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB5_3:
+;
+; GFX10-64-LABEL: wqm_demote_dynamic:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_wqm_b64 s[28:29], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB5_3
+; GFX10-64-NEXT:  BB5_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB5_3:
+.entry: ; the demote operand in this test is a runtime compare result, with no branch around the call
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0 ; same element as %tex0, so %coord1 below is %tex0 + %tex0
+  %z.cmp = fcmp olt float %tex0, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) ; lanes whose %z.cmp is false are the ones demoted
+  %coord1 = fadd float %tex0, %tex1
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 ; sampled again after the demote, using the doubled coordinate
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
+; SI-LABEL: wqm_deriv:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    s_movk_i32 s2, 0x3c00
+; SI-NEXT:    s_bfe_u32 s4, 0, 0x100000
+; SI-NEXT:    s_bfe_u32 s3, s2, 0x100000
+; SI-NEXT:    s_lshl_b32 s2, s4, 16
+; SI-NEXT:    s_or_b32 s2, s3, s2
+; SI-NEXT:    s_lshl_b32 s3, s3, 16
+; SI-NEXT:    s_or_b32 s3, s4, s3
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
+; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v1, v0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; SI-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; SI-NEXT:  ; %bb.4: ; %.demote1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.5: ; %.demote1
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB6_7:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.4: ; %.demote1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.5: ; %.demote1
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB6_7:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s2, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_mov_b32 s1, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_b32 s1, s0, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s1, s1, -1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB6_7:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB6_7:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry: ; two constant demotes with a live.mask query and a quad-lane difference in between
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 ; %x1 is not referenced again in this function
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0 ; a zero %argi skips the first demote
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; constant false operand, so every lane reaching this block is demoted
+  br label %.continue0
+
+.continue0:
+  %live = call i1 @llvm.amdgcn.live.mask() ; queried after the optional demote above
+  %live.cond = select i1 %live, i32 0, i32 1065353216 ; 1065353216 is 0x3F800000, the bit pattern of float 1.0
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) ; dpp_ctrl 85 is 0x55, shown as quad_perm 1,1,1,1 in the expected output
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) ; dpp_ctrl 0 is shown as quad_perm 0,0,0,0 in the expected output
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1 ; only lanes that are live and saw a zero difference skip the second demote
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; second constant demote
+  br label %.continue1
+
+.continue1:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
+; SI-LABEL: wqm_deriv_loop:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    s_movk_i32 s2, 0x3c00
+; SI-NEXT:    s_bfe_u32 s4, 0, 0x100000
+; SI-NEXT:    s_bfe_u32 s3, s2, 0x100000
+; SI-NEXT:    s_lshl_b32 s2, s4, 16
+; SI-NEXT:    s_or_b32 s2, s3, s2
+; SI-NEXT:    s_lshl_b32 s3, s3, 16
+; SI-NEXT:    s_or_b32 s3, s4, s3
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
+; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    s_branch BB7_5
+; SI-NEXT:  BB7_4: ; %.continue1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; SI-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
+; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_8
+; SI-NEXT:  BB7_5: ; %.continue0
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[6:7]
+; SI-NEXT:    v_mov_b32_e32 v3, v2
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; SI-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; SI-NEXT:    s_and_b64 s[6:7], s[0:1], vcc
+; SI-NEXT:    s_xor_b64 s[6:7], s[6:7], -1
+; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; SI-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; SI-NEXT:    s_cbranch_execz BB7_4
+; SI-NEXT:  ; %bb.6: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.7: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_branch BB7_4
+; SI-NEXT:  BB7_8: ; %.return
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    v_mov_b32_e32 v1, s3
+; SI-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB7_9:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv_loop:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[2:3], 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    s_branch BB7_5
+; GFX9-NEXT:  BB7_4: ; %.continue1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_add_u32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
+; GFX9-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB7_8
+; GFX9-NEXT:  BB7_5: ; %.continue0
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GFX9-NEXT:    s_cbranch_execz BB7_4
+; GFX9-NEXT:  ; %bb.6: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.7: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_branch BB7_4
+; GFX9-NEXT:  BB7_8: ; %.return
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB7_9:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv_loop:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    s_mov_b32 s1, 0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s3, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, s1
+; GFX10-32-NEXT:    s_branch BB7_5
+; GFX10-32-NEXT:  BB7_4: ; %.continue1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, v0, v1
+; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB7_8
+; GFX10-32-NEXT:  BB7_5: ; %.continue0
+; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-32-NEXT:    s_mov_b32 s2, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s2
+; GFX10-32-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-32-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX10-32-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
+; GFX10-32-NEXT:    s_and_b32 s2, s0, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s2, s2, -1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s3, s2
+; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s3
+; GFX10-32-NEXT:    s_cbranch_execz BB7_4
+; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_wqm_b32 s3, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:    s_branch BB7_4
+; GFX10-32-NEXT:  BB7_8: ; %.return
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB7_9:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv_loop:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    s_mov_b32 s2, 0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0
+; GFX10-64-NEXT:    s_branch BB7_5
+; GFX10-64-NEXT:  BB7_4: ; %.continue1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v1
+; GFX10-64-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_8
+; GFX10-64-NEXT:  BB7_5: ; %.continue0
+; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[4:5]
+; GFX10-64-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-64-NEXT:    v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec
+; GFX10-64-NEXT:    v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX10-64-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_4
+; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:    s_branch BB7_4
+; GFX10-64-NEXT:  BB7_8: ; %.return
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB7_9:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry: ; same shape as wqm_deriv, but the live.mask query and second demote sit inside a counted loop
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 ; %x1 is not referenced again in this function
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0 ; a zero %argi skips the demote ahead of the loop
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; constant false operand, so every lane reaching this block is demoted
+  br label %.continue0
+
+.continue0:
+  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] ; loop counter, 0 when arriving from outside the loop
+  %live = call i1 @llvm.amdgcn.live.mask() ; queried again on every iteration
+  %live.cond = select i1 %live, i32 0, i32 %count ; lanes reported live contribute 0, the others contribute the loop counter
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) ; dpp_ctrl 85 is 0x55, shown as quad_perm 1,1,1,1 in the expected output
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) ; dpp_ctrl 0 is shown as quad_perm 0,0,0,0 in the expected output
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1 ; only lanes that are live and saw a zero difference skip the in-loop demote
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; demote inside the loop body
+  br label %.continue1
+
+.continue1:
+  %next = add i32 %count, 1
+  %loop.cond = icmp slt i32 %next, %limit
+  br i1 %loop.cond, label %.continue0, label %.return ; loop back while %next is below %limit (signed compare)
+
+.return:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+declare void @llvm.amdgcn.wqm.demote(i1) #0 ; a false operand turns the active lanes into helper lanes
+declare i1 @llvm.amdgcn.live.mask() #0 ; true for lanes that are live, i.e. not helper lanes
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
+declare float @llvm.amdgcn.wqm.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4
+
+attributes #0 = { nounwind } ; also attached to the image sample call sites in the tests above
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readnone speculatable }
+attributes #3 = { inaccessiblememonly nounwind }
+attributes #4 = { convergent nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.live.mask.mir
@@ -0,0 +1,16 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: live_mask
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: live_mask
+    ; CHECK: [[INT:%[0-9]+]]:vcc(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask)
+    ; CHECK: S_ENDPGM 0, implicit [[INT]](s1)
+    %0:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.live.mask) ; the s1 result starts with no bank and is expected to be assigned to vcc
+    S_ENDPGM 0, implicit %0 ; implicit use keeps the intrinsic result referenced
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.demote.mir
@@ -0,0 +1,69 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: wqm_demote_scc
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1
+    ; CHECK-LABEL: name: wqm_demote_scc
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+    ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+    ; CHECK: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[ICMP]](s32)
+    ; CHECK: [[COPY2:%[0-9]+]]:vcc(s1) = COPY [[TRUNC]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY2]](s1)
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s1) = G_ICMP intpred(eq), %0, %1 ; both compare inputs come from SGPRs
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 ; the sgpr condition is expected to be copied into the vcc bank
+...
+
+---
+name: wqm_demote_vcc
+legalized: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; CHECK-LABEL: name: wqm_demote_vcc
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]]
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[ICMP]](s1)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s1) = G_ICMP intpred(eq), %0, %1 ; both compare inputs come from VGPRs, so the result is expected in the vcc bank
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %2 ; the condition is expected to be used directly, with no extra copy
+...
+
+---
+name: wqm_demote_constant_true
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: wqm_demote_constant_true
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true
+    ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
+    %0:_(s1) = G_CONSTANT i1 true ; the constant is expected to be assigned to the sgpr bank
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 ; the constant is expected to reach the intrinsic through a copy into the vcc bank
+...
+
+---
+name: wqm_demote_constant_false
+legalized: true
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: wqm_demote_constant_false
+    ; CHECK: [[C:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 false
+    ; CHECK: [[COPY:%[0-9]+]]:vcc(s1) = COPY [[C]](s1)
+    ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), [[COPY]](s1)
+    %0:_(s1) = G_CONSTANT i1 false ; the constant is expected to be assigned to the sgpr bank
+    G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.wqm.demote), %0 ; the constant is expected to reach the intrinsic through a copy into the vcc bank
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -0,0 +1,1177 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-32 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10-64 %s
+
+define amdgpu_ps void @static_exact(float %arg0, float %arg1) {
+; SI-LABEL: static_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_andn2_b64 exec, exec, exec
+; SI-NEXT:    s_cbranch_scc0 BB0_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB0_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: static_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX9-NEXT:    s_cbranch_scc0 BB0_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB0_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: static_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB0_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: static_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB0_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB0_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry: ; unconditional demote in a shader with no image sample or derivative operations
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0 ; NOTE(review): %c1 is never used in this function
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; constant-false condition, so all active lanes become helper lanes
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) {
+; SI-LABEL: dynamic_exact:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_cbranch_scc0 BB1_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB1_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: dynamic_exact:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_cbranch_scc0 BB1_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB1_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: dynamic_exact:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cmp_le_f32_e64 s0, 0, v1
+; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s1, s1, s0
+; GFX10-32-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB1_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: dynamic_exact:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cmp_le_f32_e64 s[0:1], 0, v1
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB1_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB1_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry: ; per-lane conditional demote in a shader with no image sample or derivative operations
+  %c0 = fcmp olt float %arg0, 0.000000e+00
+  %c1 = fcmp oge float %arg1, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %c1) ; lanes where %c1 is false (%arg1 < 0 or NaN) become helper lanes
+  %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+define amdgpu_ps void @branch(float %arg0, float %arg1) {
+; SI-LABEL: branch:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_and_b32_e32 v1, 1, v0
+; SI-NEXT:    v_and_b32_e32 v0, 1, v0
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; SI-NEXT:    s_cbranch_scc0 BB2_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB2_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: branch:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX9-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB2_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB2_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: branch:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-32-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-32-NEXT:    v_and_b32_e32 v1, 1, v0
+; GFX10-32-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
+; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB2_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: branch:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-64-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX10-64-NEXT:    v_and_b32_e32 v1, 1, v0
+; GFX10-64-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB2_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry: ; demote placed in a conditionally executed block, with no image sample or derivative operations
+  %i0 = fptosi float %arg0 to i32
+  %i1 = fptosi float %arg1 to i32
+  %c0 = or i32 %i0, %i1
+  %c1 = and i32 %c0, 1
+  %c2 = icmp eq i32 %c1, 0
+  br i1 %c2, label %.continue, label %.demote ; lanes with bit 0 of (%i0 | %i1) set take .demote
+
+.demote: ; every lane that reaches this block is demoted
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue: ; reached by both paths; %c2 is true only for lanes that skipped .demote
+  %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
+  ret void
+}
+
+
+define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_1:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB3_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB3_5
+; SI-NEXT:  BB3_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB3_5:
+;
+; GFX9-LABEL: wqm_demote_1:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB3_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB3_5
+; GFX9-NEXT:  BB3_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB3_5:
+;
+; GFX10-32-LABEL: wqm_demote_1:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB3_5
+; GFX10-32-NEXT:  BB3_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB3_5:
+;
+; GFX10-64-LABEL: wqm_demote_1:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB3_5
+; GFX10-64-NEXT:  BB3_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB3_5:
+.entry: ; demote sits in a branch that precedes both image samples
+  %z.cmp = fcmp olt float %z, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote ; lanes where %z.cmp is false (%z >= 0 or NaN) take .demote
+
+.demote: ; every lane that reaches this block is demoted
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0 ; NOTE(review): same element (index 0) as %tex0 - confirm this is intended
+  %coord1 = fadd float %tex0, %tex1 ; the second sample's coordinate depends on the first sample's result
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_2:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:  ; %bb.1: ; %.demote
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; SI-NEXT:    s_cbranch_scc0 BB4_4
+; SI-NEXT:  ; %bb.2: ; %.demote
+; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
+; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB4_5
+; SI-NEXT:  BB4_4:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB4_5:
+;
+; GFX9-LABEL: wqm_demote_2:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:  ; %bb.1: ; %.demote
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB4_4
+; GFX9-NEXT:  ; %bb.2: ; %.demote
+; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB4_5
+; GFX9-NEXT:  BB4_4:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB4_5:
+;
+; GFX10-32-LABEL: wqm_demote_2:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote
+; GFX10-32-NEXT:    s_wqm_b32 s28, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB4_5
+; GFX10-32-NEXT:  BB4_4:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB4_5:
+;
+; GFX10-64-LABEL: wqm_demote_2:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote
+; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB4_5
+; GFX10-64-NEXT:  BB4_4:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB4_5:
+.entry: ; demote sits in a branch between the two image samples; the branch condition comes from the first sample
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0 ; NOTE(review): same element (index 0) as %tex0 - confirm this is intended
+  %z.cmp = fcmp olt float %tex0, 0.0
+  br i1 %z.cmp, label %.continue, label %.demote ; lanes where %z.cmp is false (%tex0 >= 0 or NaN) take .demote
+
+.demote: ; every lane that reaches this block is demoted
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue
+
+.continue:
+  %coord1 = fadd float %tex0, %tex1 ; the second sample's coordinate depends on the first sample's result
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
+; SI-LABEL: wqm_demote_dynamic:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[12:13], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; SI-NEXT:    s_cbranch_scc0 BB5_2
+; SI-NEXT:  ; %bb.1: ; %.entry
+; SI-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; SI-NEXT:    s_and_b64 exec, exec, s[14:15]
+; SI-NEXT:    v_add_f32_e32 v0, v0, v0
+; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
+; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_branch BB5_3
+; SI-NEXT:  BB5_2:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB5_3:
+;
+; GFX9-LABEL: wqm_demote_dynamic:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[12:13], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX9-NEXT:    s_cbranch_scc0 BB5_2
+; GFX9-NEXT:  ; %bb.1: ; %.entry
+; GFX9-NEXT:    s_wqm_b64 s[14:15], s[12:13]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[14:15]
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_branch BB5_3
+; GFX9-NEXT:  BB5_2:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB5_3:
+;
+; GFX10-32-LABEL: wqm_demote_dynamic:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s12, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s13, vcc_lo, exec_lo
+; GFX10-32-NEXT:    s_andn2_b32 s12, s12, s13
+; GFX10-32-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-32-NEXT:  ; %bb.1: ; %.entry
+; GFX10-32-NEXT:    s_wqm_b32 s13, s12
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s13
+; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
+; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-32-NEXT:    s_branch BB5_3
+; GFX10-32-NEXT:  BB5_2:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB5_3:
+;
+; GFX10-64-LABEL: wqm_demote_dynamic:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[12:13], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[14:15], vcc, exec
+; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], s[14:15]
+; GFX10-64-NEXT:    s_cbranch_scc0 BB5_2
+; GFX10-64-NEXT:  ; %bb.1: ; %.entry
+; GFX10-64-NEXT:    s_wqm_b64 s[28:29], s[12:13]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[28:29]
+; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-64-NEXT:    s_branch BB5_3
+; GFX10-64-NEXT:  BB5_2:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB5_3:
+.entry: ; demote is called directly with a per-lane condition, between the two image samples
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+  %tex0 = extractelement <4 x float> %tex, i32 0
+  %tex1 = extractelement <4 x float> %tex, i32 0 ; NOTE(review): same element (index 0) as %tex0 - confirm this is intended
+  %z.cmp = fcmp olt float %tex0, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) ; lanes where %z.cmp is false (%tex0 >= 0 or NaN) become helper lanes
+  %coord1 = fadd float %tex0, %tex1 ; the second sample's coordinate depends on the first sample's result
+  %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0
+
+  ret <4 x float> %rtex
+}
+
+
+define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
+; SI-LABEL: wqm_deriv:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
+; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; SI-NEXT:    v_mov_b32_e32 v1, v0
+; SI-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:  ; %bb.4: ; %.demote1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB6_7
+; SI-NEXT:  ; %bb.5: ; %.demote1
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
+; SI-NEXT:    v_bfrev_b32_e32 v0, 60
+; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB6_7:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.4: ; %.demote1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB6_7
+; GFX9-NEXT:  ; %bb.5: ; %.demote1
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB6_7:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s2, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_mov_b32 s1, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-32-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_xor_b32 s1, s0, -1
+; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB6_7:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-64-NEXT:    v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX10-64-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
+; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
+; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB6_7:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry:
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue0
+
+.continue0:
+  %live = call i1 @llvm.amdgcn.live.mask()
+  %live.cond = select i1 %live, i32 0, i32 1065353216
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true)
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true)
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0)
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false)
+  br label %.continue1
+
+.continue1:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3
+  ret void
+}
+
+define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) {
+; SI-LABEL: wqm_deriv_loop:
+; SI:       ; %bb.0: ; %.entry
+; SI-NEXT:    s_mov_b64 s[0:1], exec
+; SI-NEXT:    s_wqm_b64 exec, exec
+; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:  ; %bb.1: ; %.demote0
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.2: ; %.demote0
+; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
+; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_mov_b64 s[4:5], 0
+; SI-NEXT:    s_branch BB7_5
+; SI-NEXT:  BB7_4: ; %.continue1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
+; SI-NEXT:    s_add_i32 s2, s2, 1
+; SI-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
+; SI-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_8
+; SI-NEXT:  BB7_5: ; %.continue0
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
+; SI-NEXT:    v_mov_b32_e32 v2, v0
+; SI-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; SI-NEXT:    s_nop 0
+; SI-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    s_nop 1
+; SI-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; SI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; SI-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
+; SI-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; SI-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; SI-NEXT:    s_cbranch_execz BB7_4
+; SI-NEXT:  ; %bb.6: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cbranch_scc0 BB7_9
+; SI-NEXT:  ; %bb.7: ; %.demote1
+; SI-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
+; SI-NEXT:    s_branch BB7_4
+; SI-NEXT:  BB7_8: ; %.return
+; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    v_bfrev_b32_e32 v0, 60
+; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; SI-NEXT:    exp mrt0 v1, v1, v0, v0 done compr vm
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  BB7_9:
+; SI-NEXT:    s_mov_b64 exec, 0
+; SI-NEXT:    exp null off, off, off, off done vm
+; SI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: wqm_deriv_loop:
+; GFX9:       ; %bb.0: ; %.entry
+; GFX9-NEXT:    s_mov_b64 s[0:1], exec
+; GFX9-NEXT:    s_wqm_b64 exec, exec
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX9-NEXT:    s_mov_b32 s2, 0
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:  ; %bb.1: ; %.demote0
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.2: ; %.demote0
+; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:    s_branch BB7_5
+; GFX9-NEXT:  BB7_4: ; %.continue1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT:    s_add_i32 s2, s2, 1
+; GFX9-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB7_8
+; GFX9-NEXT:  BB7_5: ; %.continue0
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, 0, s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX9-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX9-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX9-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX9-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; GFX9-NEXT:    s_cbranch_execz BB7_4
+; GFX9-NEXT:  ; %bb.6: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cbranch_scc0 BB7_9
+; GFX9-NEXT:  ; %bb.7: ; %.demote1
+; GFX9-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX9-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX9-NEXT:    s_branch BB7_4
+; GFX9-NEXT:  BB7_8: ; %.return
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX9-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX9-NEXT:    s_endpgm
+; GFX9-NEXT:  BB7_9:
+; GFX9-NEXT:    s_mov_b64 exec, 0
+; GFX9-NEXT:    exp null off, off, off, off done vm
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-32-LABEL: wqm_deriv_loop:
+; GFX10-32:       ; %bb.0: ; %.entry
+; GFX10-32-NEXT:    s_mov_b32 s0, exec_lo
+; GFX10-32-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GFX10-32-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-32-NEXT:    s_mov_b32 s1, 0
+; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
+; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-32-NEXT:    s_wqm_b32 s3, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-32-NEXT:    s_mov_b32 s2, 0
+; GFX10-32-NEXT:    s_branch BB7_5
+; GFX10-32-NEXT:  BB7_4: ; %.continue1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-32-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-32-NEXT:    v_cmp_ge_i32_e32 vcc_lo, s2, v1
+; GFX10-32-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX10-32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB7_8
+; GFX10-32-NEXT:  BB7_5: ; %.continue0
+; GFX10-32-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-32-NEXT:    s_mov_b32 s3, s0
+; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s3
+; GFX10-32-NEXT:    s_xor_b32 s3, s0, -1
+; GFX10-32-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-32-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
+; GFX10-32-NEXT:    s_or_b32 s3, s3, vcc_lo
+; GFX10-32-NEXT:    s_and_saveexec_b32 s4, s3
+; GFX10-32-NEXT:    s_xor_b32 s3, exec_lo, s4
+; GFX10-32-NEXT:    s_cbranch_execz BB7_4
+; GFX10-32-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-32-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-32-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-32-NEXT:    s_wqm_b32 s4, s0
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s4
+; GFX10-32-NEXT:    s_branch BB7_4
+; GFX10-32-NEXT:  BB7_8: ; %.return
+; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
+; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-32-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-32-NEXT:    s_endpgm
+; GFX10-32-NEXT:  BB7_9:
+; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
+; GFX10-32-NEXT:    exp null off, off, off, off done vm
+; GFX10-32-NEXT:    s_endpgm
+;
+; GFX10-64-LABEL: wqm_deriv_loop:
+; GFX10-64:       ; %bb.0: ; %.entry
+; GFX10-64-NEXT:    s_mov_b64 s[0:1], exec
+; GFX10-64-NEXT:    s_wqm_b64 exec, exec
+; GFX10-64-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX10-64-NEXT:    s_mov_b32 s2, 0
+; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
+; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_mov_b64 s[4:5], 0
+; GFX10-64-NEXT:    s_branch BB7_5
+; GFX10-64-NEXT:  BB7_4: ; %.continue1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX10-64-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-64-NEXT:    v_cmp_ge_i32_e32 vcc, s2, v1
+; GFX10-64-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX10-64-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_8
+; GFX10-64-NEXT:  BB7_5: ; %.continue0
+; GFX10-64-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-64-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, s2, 0, s[6:7]
+; GFX10-64-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX10-64-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-64-NEXT:    v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:0
+; GFX10-64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec
+; GFX10-64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
+; GFX10-64-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
+; GFX10-64-NEXT:    s_and_saveexec_b64 s[8:9], s[6:7]
+; GFX10-64-NEXT:    s_xor_b64 s[6:7], exec, s[8:9]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_4
+; GFX10-64-NEXT:  ; %bb.6: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
+; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
+; GFX10-64-NEXT:  ; %bb.7: ; %.demote1
+; GFX10-64-NEXT:    ; in Loop: Header=BB7_5 Depth=1
+; GFX10-64-NEXT:    s_wqm_b64 s[8:9], s[0:1]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX10-64-NEXT:    s_branch BB7_4
+; GFX10-64-NEXT:  BB7_8: ; %.return
+; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX10-64-NEXT:    s_and_b64 exec, exec, s[0:1]
+; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
+; GFX10-64-NEXT:    exp mrt0 v0, v0, v1, v1 done compr vm
+; GFX10-64-NEXT:    s_endpgm
+; GFX10-64-NEXT:  BB7_9:
+; GFX10-64-NEXT:    s_mov_b64 exec, 0
+; GFX10-64-NEXT:    exp null off, off, off, off done vm
+; GFX10-64-NEXT:    s_endpgm
+.entry: ; Loop variant of the demote + live.mask pattern, with one demote ahead of the loop and one inside it.
+  %p0 = extractelement <2 x float> %input, i32 0
+  %p1 = extractelement <2 x float> %input, i32 1
+  %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 ; only consumer is the interp.p2 call below
+  %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 ; %x1 has no users in this function
+  %argi = fptosi float %arg to i32
+  %cond0 = icmp eq i32 %argi, 0
+  br i1 %cond0, label %.continue0, label %.demote0 ; lanes with %argi != 0 go through the first demote
+
+.demote0:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; i1 false marks the lanes active here as helper lanes
+  br label %.continue0
+
+.continue0:
+  %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] ; loop counter, 0 from both entry paths and %next on the back edge
+  %live = call i1 @llvm.amdgcn.live.mask() ; true for lanes that are live and not helper lanes
+  %live.cond = select i1 %live, i32 0, i32 %count ; live lanes pick 0, all other lanes pick the loop counter
+  %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) ; dpp_ctrl 85 = 0b01010101, presumably the quad_perm [1,1,1,1] in the expected output
+  %live.v0f = bitcast i32 %live.v0 to float
+  %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) ; dpp_ctrl 0, presumably the quad_perm [0,0,0,0] in the expected output
+  %live.v1f = bitcast i32 %live.v1 to float
+  %v0 = fsub float %live.v0f, %live.v1f ; difference of the two shuffled values after reinterpreting them as float
+  %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) ; NOTE(review) presumably requests whole-quad-mode evaluation of %v0 - confirm against the intrinsic definition
+  %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00
+  %cond2 = and i1 %live, %cond1 ; skip the demote only for live lanes whose difference compares equal to 0.0
+  br i1 %cond2, label %.continue1, label %.demote1
+
+.demote1:
+  call void @llvm.amdgcn.wqm.demote(i1 false) ; second demote, reached from inside the loop body
+  br label %.continue1
+
+.continue1:
+  %next = add i32 %count, 1
+  %loop.cond = icmp slt i32 %next, %limit ; signed compare of the incremented counter against %limit
+  br i1 %loop.cond, label %.continue0, label %.return ; back edge to %.continue0 while %next < %limit
+
+.return:
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> <half 0xH3C00, half 0xH0000>, <2 x half> <half 0xH0000, half 0xH3C00>, i1 immarg true, i1 immarg true) #3 ; single export of constant half vectors after the loop
+  ret void
+}
+
+declare void @llvm.amdgcn.wqm.demote(i1) #0 ; when the i1 operand is false, the active lanes become helper lanes until the end of the program
+declare i1 @llvm.amdgcn.live.mask() #0 ; returns true for lanes that are live and not helper lanes
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 ; not called by wqm_deriv or wqm_deriv_loop - presumably used by tests earlier in this file
+declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; not called by wqm_deriv or wqm_deriv_loop - presumably used by tests earlier in this file
+declare float @llvm.amdgcn.wqm.f32(float) #1 ; applied to the fsub result in the wqm_deriv tests
+declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 ; first interpolation step, its result feeds interp.p2
+declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 ; second interpolation step, takes the interp.p1 result as its first operand
+declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 ; compressed half-precision export used at the end of the wqm_deriv tests
+declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 ; cross-lane move, the tests pass dpp_ctrl values 85 and 0
+
+attributes #0 = { nounwind } ; group used by wqm.demote, live.mask and exp.f32
+attributes #1 = { nounwind readnone } ; group used by image.sample and wqm.f32
+attributes #2 = { nounwind readnone speculatable } ; group used by interp.p1 and interp.p2, both on the declarations and at the call sites
+attributes #3 = { inaccessiblememonly nounwind } ; group used by exp.compr.v2f16
+attributes #4 = { convergent nounwind readnone } ; group used by mov.dpp.i32