diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1263,6 +1263,12 @@ [], [IntrNoMem]>; +// Like ps.live, but cannot be moved by LICM. +// (i.e. this returns true if not a helper) +def int_amdgcn_wqm_helper : Intrinsic <[llvm_i1_ty], + [], [IntrReadMem, IntrInaccessibleMemOnly] +>; + def int_amdgcn_mbcnt_lo : GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; @@ -1478,6 +1484,11 @@ // If false, set EXEC=0 for the current thread until the end of program. def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>; +// If false, mark all active lanes as helper lanes until the end of program. +def int_amdgcn_wqm_demote : Intrinsic<[], + [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly] +>; + // Copies the active channels of the source value to the destination value, // with the guarantee that the source value is computed as if the entire // program were executed in Whole Wavefront Mode, i.e. with all channels diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -239,6 +239,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -66,6 +66,8 @@ bool kill(MachineInstr &MI); + void demoteCleanup(MachineInstr &MI); + bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); public: @@ -342,6 +344,28 @@ } } +void SIInsertSkips::demoteCleanup(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + switch (MI.getOpcode()) { + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B32), AMDGPU::EXEC_LO) + .addReg(AMDGPU::EXEC_LO) + .add(MI.getOperand(0)); + break; + + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .add(MI.getOperand(0)); + break; + + default: + llvm_unreachable("invalid opcode, expected SI_DEMOTE_CLEANUP_*_TERMINATOR"); + } +} + // Returns true if a branch over the block was inserted. bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { @@ -417,6 +441,18 @@ break; } + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: + assert(MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS); + demoteCleanup(MI); + if (dominatesAllReachable(MBB)) { + // As with kill we can null export if all lanes are demoted. 
+ KillInstrs.push_back(&MI); + } else { + MI.eraseFromParent(); + } + break; + case AMDGPU::SI_RETURN_TO_EPILOG: // FIXME: Should move somewhere else assert(!MF.getInfo()->returnsVoid()); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1554,6 +1554,18 @@ MI.setDesc(get(AMDGPU::S_ANDN2_B32)); break; + case AMDGPU::S_AND_B64_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B64)); + break; + + case AMDGPU::S_AND_B32_term: + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(get(AMDGPU::S_AND_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2107,15 +2119,19 @@ case AMDGPU::S_MOV_B64_term: case AMDGPU::S_XOR_B64_term: case AMDGPU::S_ANDN2_B64_term: + case AMDGPU::S_AND_B64_term: case AMDGPU::S_MOV_B32_term: case AMDGPU::S_XOR_B32_term: case AMDGPU::S_OR_B32_term: case AMDGPU::S_ANDN2_B32_term: + case AMDGPU::S_AND_B32_term: break; case AMDGPU::SI_IF: case AMDGPU::SI_ELSE: case AMDGPU::SI_KILL_I1_TERMINATOR: case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: // FIXME: It's messy that these need to be considered here at all. return true; default: @@ -6392,6 +6408,8 @@ switch (Opcode) { case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: case AMDGPU::SI_KILL_I1_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR: + case AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR: return true; default: return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -237,6 +237,7 @@ def S_MOV_B64_term : WrapTerminatorInst; def S_XOR_B64_term : WrapTerminatorInst; def S_ANDN2_B64_term : WrapTerminatorInst; +def S_AND_B64_term : WrapTerminatorInst; } let WaveSizePredicate = isWave32 in { @@ -244,6 +245,7 @@ def S_XOR_B32_term : WrapTerminatorInst; def S_OR_B32_term : WrapTerminatorInst; def S_ANDN2_B32_term : WrapTerminatorInst; +def S_AND_B32_term : WrapTerminatorInst; } def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), @@ -371,6 +373,31 @@ let SALU = 1; } +let Uses = [EXEC] in { +def SI_WQM_HELPER : PseudoInstSI < + (outs SReg_1:$dst), (ins), + [(set i1:$dst, (int_amdgcn_wqm_helper))]> { + let SALU = 1; +} + +let Defs = [EXEC] in { +// Demote: Turn a pixel shader thread into a helper lane. +def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)> { +} +// Demote clean up terminators are added to indicate points where the execmask +// can be cleaned using the specified livemask. +// This allows the early termination of threads if their quad has become only +// helper lanes as a result of demotes. 
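+// (These terminators are expanded by SIInsertSkips::demoteCleanup into an
+// "s_and_b32 exec_lo, exec_lo, livemask" / "s_and_b64 exec, exec, livemask".)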
+def SI_DEMOTE_CLEANUP_B32_TERMINATOR : SPseudoInstSI <(outs), (ins SSrc_b32:$livemask)> { + let isTerminator = 1; +} +def SI_DEMOTE_CLEANUP_B64_TERMINATOR : SPseudoInstSI <(outs), (ins SSrc_b64:$livemask)> { + let isTerminator = 1; +} +} // End Defs = [EXEC] + +} // End Uses = [EXEC] + def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), [(int_amdgcn_unreachable)], "; divergent unreachable"> { @@ -745,6 +772,16 @@ (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1) >; +def : Pat < + (int_amdgcn_wqm_demote i1:$src), + (SI_DEMOTE_I1 $src, 0) +>; + +def : Pat < + (int_amdgcn_wqm_demote (i1 (not i1:$src))), + (SI_DEMOTE_I1 $src, -1) +>; + def : Pat < (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))), (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp --- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp @@ -215,6 +215,18 @@ MI.setDesc(TII.get(AMDGPU::S_ANDN2_B32)); return true; } + case AMDGPU::S_AND_B64_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B64)); + return true; + } + case AMDGPU::S_AND_B32_term: { + // This is only a terminator to get the correct spill code placement during + // register allocation. + MI.setDesc(TII.get(AMDGPU::S_AND_B32)); + return true; + } default: return false; } diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -68,11 +68,13 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -134,6 +136,9 @@ char Needs = 0; char InNeeds = 0; char OutNeeds = 0; + char InitialState = 0; + unsigned LiveMaskIn = 0; // Initial live mask register + unsigned LiveMaskOut = 0; // Outgoing live mask register }; struct WorkItem { @@ -153,12 +158,22 @@ const GCNSubtarget *ST; MachineRegisterInfo *MRI; LiveIntervals *LIS; + MachineDominatorTree *MDT; + MachinePostDominatorTree *PDT; DenseMap Instructions; - MapVector Blocks; - SmallVector LiveMaskQueries; + DenseMap Blocks; + + // Tracks live mask output of instructions + DenseMap LiveMaskRegs; + // Tracks state (WQM/WWM/Exact) after a given instruction + DenseMap StateTransition; + + SmallVector LiveMaskQueries; SmallVector LowerToMovInstrs; SmallVector LowerToCopyInstrs; + SmallVector DemoteInstrs; + SmallSet NeedsDemoteCleanup; void printInfo(); @@ -171,12 +186,16 @@ void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist); char analyzeFunction(MachineFunction &MF); + void scanLiveLanes(MachineBasicBlock &MBB, + std::vector &Worklist); + void analyzeLiveLanes(MachineFunction &MF); + MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before); MachineBasicBlock::iterator prepareInsertion(MachineBasicBlock &MBB, 
MachineBasicBlock::iterator First, MachineBasicBlock::iterator Last, bool PreferLast, - bool SaveSCC); + bool SaveSCC, bool CheckPhys); void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveWQM, unsigned LiveMaskReg); void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, @@ -184,11 +203,28 @@ void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, unsigned SaveOrig); void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig); - void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry); + unsigned SavedOrig, char NonWWMState); + + bool canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI); + MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI); + void lowerBlock(MachineBasicBlock &MBB); + + unsigned findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI, + MachineBasicBlock::iterator &Before); + void processBlock(MachineBasicBlock &MBB, bool isEntry); - void lowerLiveMaskQueries(unsigned LiveMaskReg); + bool lowerLiveMaskQueries(unsigned LiveMaskReg); void lowerCopyInstrs(); + bool lowerDemoteInstrs(); + + void lowerLiveMaskQuery(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskReg, bool isWQM); + MachineInstr *lowerDemote(MachineBasicBlock &MBB, MachineInstr &MI, + unsigned LiveMaskIn, unsigned LiveMaskOut, + bool isWQM); + MachineInstr *insertDemoteCleanup(MachineBasicBlock &MBB, MachineInstr *MI, + MachineBasicBlock::iterator *Before, + unsigned LiveMask); public: static char ID; @@ -201,10 +237,14 @@ StringRef getPassName() const override { return "SI Whole Quad Mode"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); AU.addPreserved(); + AU.addRequired(); AU.addPreserved(); - AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -216,6 +256,8 @@ INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false, false) @@ -323,6 +365,7 @@ for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = **BI; BlockInfo &BBI = Blocks[&MBB]; + bool HasDemoteInBlock = false; for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { MachineInstr &MI = *II; @@ -381,8 +424,39 @@ III.Disabled = StateWQM | StateWWM; continue; } else { - if (Opcode == AMDGPU::SI_PS_LIVE) { + if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_WQM_HELPER) { LiveMaskQueries.push_back(&MI); + } else if (Opcode == AMDGPU::SI_DEMOTE_I1) { + // Only perform a demote dominance test once per block + if (!HasDemoteInBlock) { + SmallVector ControlFlowInstrs; + bool DominatesAllReachable = true; + + // Simultaneously check if this demote is in control flow + // (dominates all blocks) and find all control flow ends + // which post dominate this block. 
+          for (MachineBasicBlock *Other : depth_first(&MBB)) {
+            if (DominatesAllReachable && !MDT->dominates(&MBB, Other))
+              DominatesAllReachable = false;
+            if (PDT->dominates(Other, &MBB)) {
+              auto FirstMI = Other->getFirstNonPHI();
+              if ((FirstMI != Other->end()) &&
+                  (FirstMI->getOpcode() == AMDGPU::SI_END_CF)) {
+                ControlFlowInstrs.push_back(&*FirstMI);
+              }
+            }
+          }
+
+          if (!DominatesAllReachable) {
+            // Demote is in control flow, hence we must mark all control
+            // flow end instructions requiring cleanup.
+            for (MachineInstr *CF : ControlFlowInstrs)
+              NeedsDemoteCleanup.insert(CF);
+          }
+        }
+
+        DemoteInstrs.push_back(&MI);
+        HasDemoteInBlock = true;
       } else if (WQMOutputs) {
         // The function is in machine SSA form, which means that physical
         // VGPRs correspond to shader inputs and outputs. Inputs are
@@ -524,6 +598,115 @@
   return GlobalFlags;
 }
 
+// Trace live mask manipulation through the block, creating new virtual
+// registers. Additionally, insert PHI nodes when a block has multiple
+// predecessors which manipulated the mask.
+void SIWholeQuadMode::scanLiveLanes(
+    MachineBasicBlock &MBB, std::vector<MachineBasicBlock *> &Worklist) {
+  BlockInfo &BI = Blocks[&MBB];
+
+  if (BI.LiveMaskIn && BI.LiveMaskOut)
+    return; // Block has been fully traced already.
+
+  if (!BI.LiveMaskIn) {
+    // Find the incoming live mask, or insert a PHI if there are multiple.
+    unsigned LastPredReg = 0;
+    unsigned Count = 0;
+    bool Valid = true;
+
+    // Find predecessor live masks while performing basic deduplication.
+    for (MachineBasicBlock *Pred : MBB.predecessors()) {
+      BlockInfo &PredBI = Blocks[Pred];
+      if (!PredBI.LiveMaskOut) {
+        Valid = false;
+        break;
+      }
+      if (PredBI.LiveMaskOut != LastPredReg) {
+        LastPredReg = PredBI.LiveMaskOut;
+        Count++;
+      }
+    }
+
+    if (Valid) {
+      // All predecessors have live mask outputs.
+      if (Count > 1) {
+        BI.LiveMaskIn = MRI->createVirtualRegister(TRI->getBoolRC());
+        MachineInstrBuilder PHI =
+            BuildMI(MBB, MBB.begin(), DebugLoc(), TII->get(TargetOpcode::PHI),
+                    BI.LiveMaskIn);
+        for (MachineBasicBlock *Pred : MBB.predecessors()) {
+          BlockInfo &PredBI = Blocks[Pred];
+          PHI.addReg(PredBI.LiveMaskOut);
+          PHI.addMBB(Pred);
+        }
+        LIS->InsertMachineInstrInMaps(*PHI);
+      } else {
+        BI.LiveMaskIn = LastPredReg;
+      }
+    } else {
+      // Not all predecessor blocks have live mask outputs,
+      // so this block will need to be revisited.
+
+      if (!BI.LiveMaskOut) {
+        // Give this block a live mask output to ensure forward progress.
+        BI.LiveMaskOut = MRI->createVirtualRegister(TRI->getBoolRC());
+      }
+
+      // Queue this block to be revisited and visit predecessors.
+      Worklist.push_back(&MBB);
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        BlockInfo &PredBI = Blocks[Pred];
+        if (!PredBI.LiveMaskOut)
+          Worklist.push_back(Pred);
+      }
+      return;
+    }
+  }
+
+  assert(BI.LiveMaskIn);
+
+  // Now that the initial live mask register is known, the block can
+  // be traced and intermediate live mask registers assigned for instructions
+  // which manipulate the mask.
+  unsigned CurrentLive = BI.LiveMaskIn;
+  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+  while (II != IE) {
+    MachineInstr &MI = *II;
+    if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
+      unsigned NewLive = MRI->createVirtualRegister(TRI->getBoolRC());
+      LiveMaskRegs[&MI] = NewLive;
+      CurrentLive = NewLive;
+    }
+    II++;
+  }
+
+  // If an output register was assigned to guarantee forward progress,
+  // then it is possible the current live register will not become the output
+  // live mask register. This will be resolved during block lowering.
+ if (!BI.LiveMaskOut) { + BI.LiveMaskOut = CurrentLive; + } +} + +// Scan blocks for live mask manipulation operations in reverse post order +// to minimise rescans: a block will have to be rescanned if it's +// predecessors live mask output is not defined. +void SIWholeQuadMode::analyzeLiveLanes(MachineFunction &MF) { + std::vector Worklist; + + ReversePostOrderTraversal RPOT(&MF); + for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { + MachineBasicBlock &MBB = **BI; + scanLiveLanes(MBB, Worklist); + } + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.back(); + Worklist.pop_back(); + scanLiveLanes(*MBB, Worklist); + } +} + MachineBasicBlock::iterator SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before) { @@ -548,7 +731,8 @@ // instructions we want to add necessarily clobber SCC. MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion( MachineBasicBlock &MBB, MachineBasicBlock::iterator First, - MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) { + MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC, + bool CheckPhys) { if (!SaveSCC) return PreferLast ? Last : First; @@ -581,9 +765,25 @@ MachineBasicBlock::iterator MBBI; - if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) + if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) { MBBI = MI; - else { + + if (CheckPhys) { + // Make sure insertion point is after any COPY instructions + // accessing physical live in registers. This is ensures that + // block splitting does not occur before all live ins have been copied. + while (MBBI != Last) { + if (MBBI->getOpcode() != AMDGPU::COPY) + break; + unsigned Register = MBBI->getOperand(1).getReg(); + if (!Register::isVirtualRegister(Register) && MBB.isLiveIn(Register)) { + MBBI++; + } else { + break; + } + } + } + } else { assert(Idx == LIS->getMBBEndIdx(&MBB)); MBBI = MBB.end(); } @@ -614,6 +814,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateExact; } void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, @@ -633,6 +834,7 @@ } LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWQM; } void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB, @@ -644,11 +846,12 @@ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig) .addImm(-1); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = StateWWM; } void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, - unsigned SavedOrig) { + unsigned SavedOrig, char NonWWMState) { MachineInstr *MI; assert(SavedOrig); @@ -656,20 +859,285 @@ ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC) .addReg(SavedOrig); LIS->InsertMachineInstrInMaps(*MI); + StateTransition[MI] = NonWWMState; +} + +void SIWholeQuadMode::lowerLiveMaskQuery(MachineBasicBlock &MBB, + MachineInstr &MI, unsigned LiveMaskReg, + bool isWQM) { + const DebugLoc &DL = MI.getDebugLoc(); + unsigned Dest = MI.getOperand(0).getReg(); + MachineInstr *Copy = + BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dest).addReg(LiveMaskReg); + LIS->ReplaceMachineInstrInMaps(MI, *Copy); + MBB.remove(&MI); +} + +MachineInstr * +SIWholeQuadMode::insertDemoteCleanup(MachineBasicBlock &MBB, MachineInstr *MI, + MachineBasicBlock::iterator *Before, + unsigned LiveMask) { + const DebugLoc &DL = DebugLoc(); + const unsigned TermOp = ST->isWave32() + ? AMDGPU::SI_DEMOTE_CLEANUP_B32_TERMINATOR + : AMDGPU::SI_DEMOTE_CLEANUP_B64_TERMINATOR; + const unsigned WQMOp = ST->isWave32() ? 
AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; + unsigned LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC()); + + MachineInstr *LiveMaskMI = + BuildMI(MBB, MI ? *MI : *Before, DL, TII->get(WQMOp), LiveMaskWQM) + .addReg(LiveMask); + MachineInstr *NewTerm = BuildMI(MBB, MI ? *MI : *Before, DL, TII->get(TermOp)) + .addReg(LiveMaskWQM); + + LIS->InsertMachineInstrInMaps(*LiveMaskMI); + LIS->InsertMachineInstrInMaps(*NewTerm); + + return NewTerm; +} + +// Lower an instruction which demotes lanes to helpers by adding +// appropriate live mask manipulation. Note this is also applied to kills. +MachineInstr *SIWholeQuadMode::lowerDemote(MachineBasicBlock &MBB, + MachineInstr &MI, + unsigned LiveMaskIn, + unsigned LiveMaskOut, bool isWQM) { + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + const unsigned AndN2 = + ST->isWave32() ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64; + const unsigned And = ST->isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; + + const DebugLoc &DL = MI.getDebugLoc(); + MachineInstr *NewMI = nullptr; + + const MachineOperand &Op = MI.getOperand(0); + int64_t KillVal = MI.getOperand(1).getImm(); + if (Op.isImm()) { + int64_t Imm = Op.getImm(); + if (Imm == KillVal) { + NewMI = BuildMI(MBB, MI, DL, TII->get(AndN2), LiveMaskOut) + .addReg(LiveMaskIn) + .addReg(Exec); + } + } else { + unsigned Opcode = KillVal ? AndN2 : And; + NewMI = BuildMI(MBB, MI, DL, TII->get(Opcode), LiveMaskOut) + .addReg(LiveMaskIn) + .add(Op); + } + + if (NewMI) { + LIS->InsertMachineInstrInMaps(*NewMI); + } + + if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) { + if (isWQM) { + // Inside WQM demotes are replaced with live mask manipulation + // and a terminator which is later lowered to remove unused helpers + MachineInstr *NewTerm = + insertDemoteCleanup(MBB, &MI, nullptr, LiveMaskOut); + LIS->RemoveMachineInstrFromMaps(MI); + MBB.remove(&MI); + return NewTerm; + } else { + // Outside WQM demotes become kills terminating the block + MI.setDesc(TII->get(AMDGPU::SI_KILL_I1_TERMINATOR)); + return &MI; + } + } + + return nullptr; +} + +bool SIWholeQuadMode::canSplitBlockAt(MachineBasicBlock *BB, MachineInstr *MI) { + // Cannot split immediately before the epilog + // because there are values in physical registers + if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { + return false; + } + + return true; } -void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, - bool isEntry) { +MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB, + MachineInstr *TermMI) { + MachineBasicBlock::iterator SplitPoint(TermMI); + SplitPoint++; + + LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ " + << *TermMI << "\n"); + + MachineBasicBlock *SplitBB = nullptr; + + // Only split the block if the split point is not + // already the end of the block. 
+ if ((SplitPoint != BB->getFirstTerminator()) && (SplitPoint != BB->end())) { + MachineFunction *MF = BB->getParent(); + SplitBB = MF->CreateMachineBasicBlock(BB->getBasicBlock()); + + MachineFunction::iterator MBBI(BB); + ++MBBI; + MF->insert(MBBI, SplitBB); + + SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end()); + SplitBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(SplitBB); + + // Update dominator trees + using DomTreeT = DomTreeBase; + SmallVector DTUpdates; + for (MachineBasicBlock *Succ : SplitBB->successors()) { + DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ}); + DTUpdates.push_back({DomTreeT::Delete, BB, Succ}); + } + DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB}); + if (MDT) + MDT->getBase().applyUpdates(DTUpdates); + if (PDT) + PDT->getBase().applyUpdates(DTUpdates); + + // Update live intervals + MachineInstr &InsertionPoint = SplitBB->front(); + LIS->insertMBBInMaps(SplitBB, &InsertionPoint); + } + + // Convert last instruction in to a terminator. + // Note: this only covers the expected patterns + switch (TermMI->getOpcode()) { + case AMDGPU::S_AND_B32: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B32_term)); + break; + case AMDGPU::S_AND_B64: + TermMI->setDesc(TII->get(AMDGPU::S_AND_B64_term)); + break; + default: + if (BB->getFirstTerminator() == BB->end()) { + assert(SplitBB != nullptr); + MachineInstr *MI = + BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(SplitBB); + LIS->InsertMachineInstrInMaps(*MI); + } + break; + } + + return SplitBB; +} + +// Replace (or supplement) instructions accessing live mask. +// This can only happen once all the live mask registers have been created +// and the execute state (WQM/WWM/Exact) of instructions is known. +void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) { auto BII = Blocks.find(&MBB); if (BII == Blocks.end()) return; + LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n"); + const BlockInfo &BI = BII->second; + SmallVector SplitPoints; + unsigned LiveMaskReg = BI.LiveMaskIn; + char State = BI.InitialState; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + auto Next = std::next(II); + MachineInstr &MI = *II; + + if (StateTransition.count(&MI)) { + // Mark transitions to Exact mode as split points so they become + // block terminators. + if (State != StateTransition[&MI] && StateTransition[&MI] == StateExact) { + if (State != StateWWM && canSplitBlockAt(&MBB, &MI)) + SplitPoints.push_back(&MI); + } + State = StateTransition[&MI]; + } + + switch (MI.getOpcode()) { + case AMDGPU::SI_PS_LIVE: + case AMDGPU::SI_WQM_HELPER: + lowerLiveMaskQuery(MBB, MI, LiveMaskReg, State == StateWQM); + break; + case AMDGPU::SI_DEMOTE_I1: { + MachineInstr *SplitPoint = lowerDemote( + MBB, MI, LiveMaskReg, LiveMaskRegs[&MI], State == StateWQM); + if (SplitPoint) + SplitPoints.push_back(SplitPoint); + break; + } + case AMDGPU::SI_END_CF: + if ((State == StateWQM) && NeedsDemoteCleanup.count(&MI)) { + MachineInstr *NewTerm = + insertDemoteCleanup(MBB, nullptr, &Next, LiveMaskReg); + SplitPoints.push_back(NewTerm); + } + break; + default: + break; + } + + if (LiveMaskRegs.count(&MI)) + LiveMaskReg = LiveMaskRegs[&MI]; + + II = Next; + } + + if (BI.LiveMaskOut != LiveMaskReg) { + // If the final live mask register does not match the expected + // register of successor blocks then insert a copy. 
+    MachineBasicBlock::instr_iterator Terminator =
+        MBB.getFirstInstrTerminator();
+    MachineInstr *MI = BuildMI(MBB, Terminator, DebugLoc(),
+                               TII->get(AMDGPU::COPY), BI.LiveMaskOut)
+                           .addReg(LiveMaskReg);
+    LIS->InsertMachineInstrInMaps(*MI);
+  }
+
+  // Perform splitting after instruction scan to simplify iteration.
+  if (!SplitPoints.empty()) {
+    MachineBasicBlock *BB = &MBB;
+    for (MachineInstr *MI : SplitPoints) {
+      BB = splitBlock(BB, MI);
+    }
+  }
+}
+
+unsigned SIWholeQuadMode::findLiveMaskReg(MachineBasicBlock &MBB, BlockInfo &BI,
+                                          MachineBasicBlock::iterator &Before) {
+  assert(BI.LiveMaskIn);
+  if (BI.LiveMaskIn == BI.LiveMaskOut)
+    return BI.LiveMaskIn;
+
+  // FIXME: make this more efficient than scanning all instructions in a block
+  unsigned LiveMaskReg = BI.LiveMaskIn;
+  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+
+  while ((II != IE) && (II != Before)) {
+    MachineInstr *I = &*II;
+    if (LiveMaskRegs.count(I))
+      LiveMaskReg = LiveMaskRegs[I];
+    II++;
+  }
+
+  assert(LiveMaskReg);
+  return LiveMaskReg;
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool isEntry) {
+  auto BII = Blocks.find(&MBB);
+  if (BII == Blocks.end())
+    return;
+
+  BlockInfo &BI = BII->second;
+
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
+  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
+    BI.InitialState = StateWQM;
     return;
+  }
 
   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                     << ":\n");
@@ -694,6 +1162,10 @@
   // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
   // switch to/from WQM as well.
   MachineBasicBlock::iterator FirstWWM = IE;
+
+  // Record initial state in block information.
+ BI.InitialState = State; + for (;;) { MachineBasicBlock::iterator Next = II; char Needs = StateExact | StateWQM; // WWM is disabled by default @@ -755,13 +1227,13 @@ First = FirstWQM; } - MachineBasicBlock::iterator Before = - prepareInsertion(MBB, First, II, Needs == StateWQM, - Needs == StateExact || WQMFromExec); + MachineBasicBlock::iterator Before = prepareInsertion( + MBB, First, II, Needs == StateWQM, Needs == StateExact || WQMFromExec, + Needs == StateExact && isEntry); if (State == StateWWM) { assert(SavedNonWWMReg); - fromWWM(MBB, Before, SavedNonWWMReg); + fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState); State = NonWWMState; } @@ -775,7 +1247,7 @@ if (!WQMFromExec && (OutNeeds & StateWQM)) SavedWQMReg = MRI->createVirtualRegister(BoolRC); - toExact(MBB, Before, SavedWQMReg, LiveMaskReg); + toExact(MBB, Before, SavedWQMReg, findLiveMaskReg(MBB, BI, Before)); State = StateExact; } else if (State == StateExact && (Needs & StateWQM) && !(Needs & StateExact)) { @@ -804,11 +1276,13 @@ if (II == IE) break; + II = Next; } } -void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { +bool SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) { + bool Changed = false; for (MachineInstr *MI : LiveMaskQueries) { const DebugLoc &DL = MI->getDebugLoc(); Register Dest = MI->getOperand(0).getReg(); @@ -818,7 +1292,20 @@ LIS->ReplaceMachineInstrInMaps(*MI, *Copy); MI->eraseFromParent(); + Changed = true; } + return Changed; +} + +bool SIWholeQuadMode::lowerDemoteInstrs() { + bool Changed = false; + for (MachineInstr *MI : DemoteInstrs) { + MachineBasicBlock *MBB = MI->getParent(); + MI->setDesc(TII->get(AMDGPU::SI_KILL_I1_TERMINATOR)); + splitBlock(MBB, MI); + Changed = true; + } + return Changed; } void SIWholeQuadMode::lowerCopyInstrs() { @@ -865,6 +1352,10 @@ LiveMaskQueries.clear(); LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); + DemoteInstrs.clear(); + LiveMaskRegs.clear(); + StateTransition.clear(); + CallingConv = MF.getFunction().getCallingConv(); ST = &MF.getSubtarget(); @@ -873,39 +1364,55 @@ TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); LIS = &getAnalysis(); + MDT = &getAnalysis(); + PDT = &getAnalysis(); + + const char GlobalFlags = analyzeFunction(MF); + const bool NeedsLiveMask = !(DemoteInstrs.empty() && LiveMaskQueries.empty()); + const unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; + unsigned LiveMaskReg = Exec; + + if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() && + LowerToMovInstrs.empty()) { + // Shader only needs Exact mode + const bool LoweredQueries = lowerLiveMaskQueries(LiveMaskReg); + const bool LoweredDemotes = lowerDemoteInstrs(); + return LoweredQueries || LoweredDemotes; + } - char GlobalFlags = analyzeFunction(MF); - unsigned LiveMaskReg = 0; - unsigned Exec = ST->isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; - if (!(GlobalFlags & StateWQM)) { - lowerLiveMaskQueries(Exec); - if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty()) - return !LiveMaskQueries.empty(); - } else { - // Store a copy of the original live mask when required - MachineBasicBlock &Entry = MF.front(); - MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - - if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) { - LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); - MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(), - TII->get(AMDGPU::COPY), LiveMaskReg) - .addReg(Exec); - LIS->InsertMachineInstrInMaps(*MI); - } + MachineBasicBlock &Entry = MF.front(); + MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); - lowerLiveMaskQueries(LiveMaskReg); + // Store a copy of the original live mask when required + if (NeedsLiveMask || (GlobalFlags & StateWQM)) { + LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC()); + MachineInstr *MI = + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg) + .addReg(Exec); + LIS->InsertMachineInstrInMaps(*MI); + } - if (GlobalFlags == StateWQM) { - // For a shader that needs only WQM, we can just set it once. - BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ? - AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), - Exec) - .addReg(Exec); + if ((GlobalFlags == StateWQM) && DemoteInstrs.empty()) { + // Shader only needs WQM + BuildMI(Entry, EntryMI, DebugLoc(), + TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64), + Exec) + .addReg(Exec); - lowerCopyInstrs(); - // EntryMI may become invalid here - return true; + lowerLiveMaskQueries(LiveMaskReg); + lowerCopyInstrs(); + return true; + } + + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + BlockInfo &BI = Blocks[&Entry]; + BI.LiveMaskIn = LiveMaskReg; + analyzeLiveLanes(MF); + } else { + for (auto BII : Blocks) { + BlockInfo &BI = Blocks[&*BII.first]; + BI.LiveMaskIn = LiveMaskReg; + BI.LiveMaskOut = LiveMaskReg; } } @@ -913,14 +1420,20 @@ lowerCopyInstrs(); - // Handle the general case - for (auto BII : Blocks) - processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin()); + for (auto BII : Blocks) { + processBlock(*BII.first, BII.first == &Entry); + } - // Physical registers like SCC aren't tracked by default anyway, so just - // removing the ranges we computed is the simplest option for maintaining - // the analysis results. - LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI)); + if (NeedsLiveMask && (GlobalFlags & StateWQM)) { + // Lowering blocks causes block splitting. + // Hence live ranges and slot indexes cease to be valid here. 
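+    // (lowerBlock also rewrites live mask queries and demote pseudos using
+    // the per-block live mask registers computed by analyzeLiveLanes.)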
+ for (auto BII : Blocks) { + lowerBlock(*BII.first); + } + } else { + lowerLiveMaskQueries(LiveMaskReg); + lowerDemoteInstrs(); + } return true; } diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll --- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll @@ -49,6 +49,14 @@ ret void } +; CHECK: DIVERGENT: %tmp0 = call i1 @llvm.amdgcn.wqm.helper() +define amdgpu_kernel void @wqm_helper(i32 addrspace(1)* %out) #0 { + %tmp0 = call i1 @llvm.amdgcn.wqm.helper() + %tmp1 = select i1 %tmp0, i32 0, i32 1 + store i32 %tmp1, i32 addrspace(1)* %out + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 @@ -56,6 +64,8 @@ declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 +declare i1 @llvm.amdgcn.wqm.helper() #2 attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } +attributes #2 = { nounwind readonly inaccessiblememonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -0,0 +1,354 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,GCN-64,PRE-GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GCN-64,PRE-GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GCN-64 %s + +; GCN-LABEL: {{^}}static_exact: +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @static_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 false) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + +; GCN-LABEL: {{^}}dynamic_exact: +; GCN-32: v_cmp_le_f32_e64 [[CND:s[0-9]+]], 0, v1 +; GCN-64: v_cmp_le_f32_e64 [[CND:s\[[0-9]+:[0-9]+\]]], 0, v1 +; GCN-32: v_cmp_gt_f32_e32 [[CMP:vcc_lo]], 0, v0 +; GCN-64: v_cmp_gt_f32_e32 [[CMP:vcc]], 0, v0 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[CND]] +; GCN-64: s_and_b64 exec, exec, [[CND]] +; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @dynamic_exact(float %arg0, float %arg1) { +.entry: + %c0 = fcmp olt float %arg0, 0.000000e+00 + %c1 = fcmp oge float %arg1, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %c1) + %tmp1 = select i1 %c0, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) 
#0 + ret void +} + +; GCN-LABEL: {{^}}branch: +; GCN-32: s_and_saveexec_b32 s1, s0 +; GCN-64: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-32: s_xor_b32 s0, exec_lo, s1 +; GCN-64: s_xor_b64 s[0:1], exec, s[2:3] +; GCN-32: s_mov_b32 exec_lo, 0 +; GCN-64: s_mov_b64 exec, 0 +; GCN-32: s_or_b32 exec_lo, exec_lo, s0 +; GCN-64: s_or_b64 exec, exec, s[0:1] +; GCN: v_cndmask_b32_e64 v0, 0, 1.0, vcc +; GCN: exp mrt1 v0, v0, v0, v0 done vm +define amdgpu_ps void @branch(float %arg0, float %arg1) { +.entry: + %i0 = fptosi float %arg0 to i32 + %i1 = fptosi float %arg1 to i32 + %c0 = or i32 %i0, %i1 + %c1 = and i32 %c0, 1 + %c2 = icmp eq i32 %c1, 0 + br i1 %c2, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tmp1 = select i1 %c2, float 1.000000e+00, float 0.000000e+00 + call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0 + ret void +} + + +; GCN-LABEL: {{^}}wqm_demote_1: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec_lo +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: ; %.continue +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: image_sample +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %z.cmp = fcmp olt float %z, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_2: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: ; %.demote +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: ; %.continue +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; 
GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + br i1 %z.cmp, label %.continue, label %.demote + +.demote: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue + +.continue: + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + +; GCN-LABEL: {{^}}wqm_demote_dynamic: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: image_sample +; GCN: v_cmp_gt_f32_e32 vcc +; GCN-32-NEXT: s_and_b32 [[LIVE:s[0-9]+]], [[ORIG]], vcc +; GCN-64-NEXT: s_and_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: s_endpgm +; GCN: v_add_f32_e32 +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: image_sample +define amdgpu_ps <4 x float> @wqm_demote_dynamic(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) { +.entry: + %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + %tex0 = extractelement <4 x float> %tex, i32 0 + %tex1 = extractelement <4 x float> %tex, i32 0 + %z.cmp = fcmp olt float %tex0, 0.0 + call void @llvm.amdgcn.wqm.demote(i1 %z.cmp) + %coord1 = fadd float %tex0, %tex1 + %rtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord1, <8 x i32> %rsrc, <4 x i32> %sampler, i1 0, i32 0, i32 0) #0 + + ret <4 x float> %rtex +} + + +; GCN-LABEL: {{^}}wqm_deriv: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN-NOT: s_cbranch_execnz +; GCN: ; %.continue0 +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: 
s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: v_cndmask_b32_e64 [[DST:v[0-9]+]], 1.0, 0, [[LIVE]] +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: ; %.demote1 +; GCN-32-NEXT: s_mov_b32 exec_lo, 0 +; GCN-64-NEXT: s_mov_b64 exec, 0 +; GCN: ; %.continue1 +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %live = call i1 @llvm.amdgcn.wqm.helper() + %live.cond = select i1 %live, i32 0, i32 1065353216 + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +; GCN-LABEL: {{^}}wqm_deriv_loop: +; GCN-NEXT: ; %.entry +; GCN-32: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo +; GCN-64: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +; GCN-32: s_wqm_b32 exec_lo, exec_lo +; GCN-64: s_wqm_b64 exec, exec +; GCN: ; %.demote0 +; GCN-32-NEXT: s_andn2_b32 [[LIVE:s[0-9]+]], [[ORIG]], exec +; GCN-64-NEXT: s_andn2_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]], exec +; GCN-32-NEXT: s_wqm_b32 [[LIVEWQM0:s[0-9]+]], [[LIVE]] +; GCN-64-NEXT: s_wqm_b64 [[LIVEWQM0:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM0]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM0]] +; GCN-NOT: s_cbranch_execnz +; GCN: ; %.continue0.preheader +; GCN-32: s_wqm_b32 [[LIVEWQM1:s[0-9]+]], [[LIVE]] +; GCN-64: s_wqm_b64 [[LIVEWQM1:s\[[0-9]+:[0-9]+\]]], [[LIVE]] +; GCN-32-NEXT: s_and_b32 exec_lo, exec_lo, [[LIVEWQM1]] +; GCN-64-NEXT: s_and_b64 exec, exec, [[LIVEWQM1]] +; GCN: s_cbranch_execnz +; GCN: exp null +; GCN: ; %.demote1 +; GCN-32: s_andn2_b32 [[LIVE]], [[LIVE]], exec +; GCN-64: s_andn2_b64 [[LIVE]], [[LIVE]], exec +; GCN-NOT: s_cbranch_execnz +; GCN: ; %.continue1 +; GCN-32: s_or_b32 exec_lo +; GCN-64: s_or_b64 exec +; GCN: ; %.continue0 +; PRE-GFX10: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:v[0-9]+]], 0, [[LIVE]] +; GFX10: v_cndmask_b32_e64 [[DST:v[0-9]+]], [[SRC:s[0-9]+]], 0, [[LIVE]] +; GCN: ; %.return +; GCN-32: s_and_b32 exec_lo, exec_lo, [[LIVE]] +; GCN-64: s_and_b64 exec, exec, [[LIVE]] +; GCN: exp mrt0 +define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index, i32 %limit) { +.entry: + %p0 = extractelement <2 x float> %input, i32 0 + %p1 = extractelement <2 x float> %input, i32 1 + %x0 = call float @llvm.amdgcn.interp.p1(float %p0, i32 immarg 0, i32 immarg 
0, i32 %index) #2 + %x1 = call float @llvm.amdgcn.interp.p2(float %x0, float %p1, i32 immarg 0, i32 immarg 0, i32 %index) #2 + %argi = fptosi float %arg to i32 + %cond0 = icmp eq i32 %argi, 0 + br i1 %cond0, label %.continue0, label %.demote0 + +.demote0: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue0 + +.continue0: + %count = phi i32 [ 0, %.entry ], [ 0, %.demote0 ], [ %next, %.continue1 ] + %live = call i1 @llvm.amdgcn.wqm.helper() + %live.cond = select i1 %live, i32 0, i32 %count + %live.v0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 85, i32 15, i32 15, i1 true) + %live.v0f = bitcast i32 %live.v0 to float + %live.v1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %live.cond, i32 0, i32 15, i32 15, i1 true) + %live.v1f = bitcast i32 %live.v1 to float + %v0 = fsub float %live.v0f, %live.v1f + %v0.wqm = call float @llvm.amdgcn.wqm.f32(float %v0) + %cond1 = fcmp oeq float %v0.wqm, 0.000000e+00 + %cond2 = and i1 %live, %cond1 + br i1 %cond2, label %.continue1, label %.demote1 + +.demote1: + call void @llvm.amdgcn.wqm.demote(i1 false) + br label %.continue1 + +.continue1: + %next = add i32 %count, 1 + %loop.cond = icmp slt i32 %next, %limit + br i1 %loop.cond, label %.continue0, label %.return + +.return: + call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) #3 + ret void +} + +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare float @llvm.amdgcn.wqm.f32(float) #1 +declare float @llvm.amdgcn.interp.p1(float, i32 immarg, i32 immarg, i32) #2 +declare float @llvm.amdgcn.interp.p2(float, float, i32 immarg, i32 immarg, i32) #2 +declare void @llvm.amdgcn.exp.compr.v2f16(i32 immarg, i32 immarg, <2 x half>, <2 x half>, i1 immarg, i1 immarg) #3 +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg) #4 +declare void @llvm.amdgcn.wqm.demote(i1) #5 +declare i1 @llvm.amdgcn.wqm.helper() #6 + +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { inaccessiblememonly nounwind } +attributes #4 = { convergent nounwind readnone } +attributes #5 = { writeonly inaccessiblememonly nounwind } +attributes #6 = { readonly inaccessiblememonly nounwind }
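
Usage note (illustrative sketch only, not part of the patch): the two new intrinsics are meant to be used together, following the patterns in the tests above; demote lanes whose condition is false, then query whether the current lane is still a live (non-helper) lane. The function and value names below are hypothetical.

  define amdgpu_ps void @demote_example(float %arg) {
  .entry:
    %keep = fcmp olt float %arg, 0.0
    ; Demote this lane to a helper lane if %keep is false.
    call void @llvm.amdgcn.wqm.demote(i1 %keep)
    ; True only while this lane is still live (not a helper); like ps.live,
    ; but modelled as reading inaccessible memory so LICM cannot hoist it
    ; above the demote.
    %live = call i1 @llvm.amdgcn.wqm.helper()
    %v = select i1 %live, float 1.0, float 0.0
    call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %v, float %v, float %v, float %v, i1 true, i1 true)
    ret void
  }

  declare void @llvm.amdgcn.wqm.demote(i1)
  declare i1 @llvm.amdgcn.wqm.helper()
  declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)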