Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -537,8 +537,9 @@
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
 
+  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSIWholeQuadModePass());
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -12,10 +12,9 @@
 /// shaders.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). This pass is run after
+/// machine instruction scheduling but before register allocation. It ensures
+/// that WQM is enabled when necessary, but disabled around stores and atomics.
 ///
 /// When necessary, this pass creates a function prolog
 ///
@@ -57,6 +56,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Format.h"
+
+#include <map>
 
 using namespace llvm;
 
@@ -75,18 +77,35 @@
 };
 
 struct BlockInfo {
-  char Needs = 0;
-  char InNeeds = 0;
-  char OutNeeds = 0;
+  struct NeedsFlags {
+    char Self = 0;
+    char In = 0;
+    char Out = 0;
+  };
+
+  NeedsFlags Needs;
+  NeedsFlags Propagated;
 };
 
 struct WorkItem {
+  struct Value {
+    SlotIndex Slot;
+    unsigned Reg = 0;
+    LaneBitmask LaneMask = 0;
+  };
+
   MachineBasicBlock *MBB = nullptr;
   MachineInstr *MI = nullptr;
+  Value V;
 
   WorkItem() {}
   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
   WorkItem(MachineInstr *MI) : MI(MI) {}
+  WorkItem(SlotIndex Slot, unsigned Reg, unsigned LaneMask) {
+    V.Slot = Slot;
+    V.Reg = Reg;
+    V.LaneMask = LaneMask;
+  }
 };
 
 class SIWholeQuadMode : public MachineFunctionPass {
@@ -94,15 +113,25 @@
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
+  LiveIntervals *LIS;
 
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
+  std::map<std::pair<SlotIndex, unsigned>, LaneBitmask> WQMValues;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
-  void printInfo();
+  void printInfo(MachineFunction &MF) const;
 
+  void markValueWQM(SlotIndex Slot, unsigned Reg, LaneBitmask LaneMask,
+                    std::vector<WorkItem> &Worklist);
   void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
+  void markInstruction(MachineInstr &MI, char Flag,
+                       std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+  void propagateValueSub(const WorkItem::Value &V, LaneBitmask LaneMask,
+                         const LiveRange &LR, std::vector<WorkItem> &Worklist);
+  void propagateValue(const WorkItem::Value &V,
+                      std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);
@@ -128,6 +157,9 @@
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -137,8 +169,11 @@
 
 char SIWholeQuadMode::ID = 0;
 
-INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
-                "SI Whole Quad Mode", false, false)
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
+                    false)
 
 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
 
@@ -153,39 +188,58 @@
     Str = "WQM";
 
   if (state & StateExact) {
     if (!Str.empty())
-      Str += "|";
+      Str += '|';
     Str += "Exact";
   }
 
   return Str;
 }
 
-void SIWholeQuadMode::printInfo() {
-  for (const auto &BII : Blocks) {
-    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n";
-    dbgs() << "  InNeeds = " << stateString(BII.second.InNeeds)
-           << ", Needs = " << stateString(BII.second.Needs)
-           << ", OutNeeds = " << stateString(BII.second.OutNeeds) << "\n\n";
+void SIWholeQuadMode::printInfo(MachineFunction &MF) const {
+  for (MachineBasicBlock &MBB : MF) {
+    BlockInfo BI;
+    auto BII = Blocks.find(&MBB);
+    if (BII != Blocks.end())
+      BI = BII->second;
 
-    for (const MachineInstr &MI : *BII.first) {
+    dbgs() << "\nBB#" << MBB.getNumber()
+           << ": In = " << stateString(BI.Needs.In)
+           << ", Self = " << stateString(BI.Needs.Self)
+           << ", Out = " << stateString(BI.Needs.Out) << '\n';
+
+    for (MachineInstr &MI : MBB) {
+      InstrInfo II;
       auto III = Instructions.find(&MI);
-      if (III == Instructions.end())
-        continue;
+      if (III != Instructions.end())
+        II = III->second;
 
-      dbgs() << "  " << MI;
-      dbgs() << "    Needs = " << stateString(III->second.Needs)
-             << ", OutNeeds = " << stateString(III->second.OutNeeds) << "\n";
+      dbgs() << ' ' << left_justify(stateString(II.Needs), 5) << ' ' << MI;
     }
   }
 }
 
+void SIWholeQuadMode::markValueWQM(SlotIndex Slot, unsigned Reg,
+                                   LaneBitmask LaneMask,
+                                   std::vector<WorkItem> &Worklist) {
+  LaneBitmask &Mask = WQMValues[std::make_pair(Slot, Reg)];
+  LaneMask &= ~Mask;
+  if (LaneMask) {
+    Mask |= LaneMask;
+    Worklist.emplace_back(Slot, Reg, LaneMask);
+  }
+}
+
 /// Mark all instructions defining the uses in \p MI as WQM.
 void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
                                   std::vector<WorkItem> &Worklist) {
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
 
+    unsigned Reg = Use.getReg();
+
     // At this point, physical registers appear as (shader) inputs or
     // non-monolithic shader outputs. Following those makes no sense (and would
     // in fact be incorrect when the same VGPR is used as both an output and an
@@ -193,27 +247,49 @@
     //
     // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
     // have to trace this, in practice it happens for 64-bit computations like
-    // pointers where both dwords are followed already anyway.
-    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+    // pointers where both dwords are followed already anyway. Branch-relevant
+    // code still uses virtual registers at this point.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
       continue;
 
-    for (MachineOperand &Def : MRI->def_operands(Use.getReg())) {
-      MachineInstr *DefMI = Def.getParent();
-      InstrInfo &DefII = Instructions[DefMI];
-
-      // Obviously skip if DefMI is already flagged as NeedWQM.
-      //
-      // The instruction might also be flagged as NeedExact. This happens when
-      // the result of an atomic is used in a WQM computation. In this case,
-      // the atomic must not run for helper pixels and the WQM result is
-      // undefined.
-      if (DefII.Needs != 0)
-        continue;
-
-      DefII.Needs = StateWQM;
-      Worklist.push_back(DefMI);
+    unsigned SubReg = Use.getSubReg();
+    LaneBitmask LaneMask = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+                                  : MRI->getMaxLaneMaskForVReg(Reg);
+
+    markValueWQM(Idx, Reg, LaneMask, Worklist);
+  }
+}
+
+void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
+                                      std::vector<WorkItem> &Worklist) {
+  InstrInfo &II = Instructions[&MI];
+  if (II.Needs == Flag)
+    return;
+
+  assert(!II.Needs);
+  assert(Flag == StateWQM || Flag == StateExact);
+
+  II.Needs = Flag;
+
+  MachineBasicBlock *MBB = MI.getParent();
+  BlockInfo &BI = Blocks[MBB];
+
+  if (!(BI.Needs.Self & Flag)) {
+    BI.Needs.Self |= Flag;
+    BI.Needs.In |= Flag;
+    Worklist.push_back(MBB);
+  }
+
+  if (MachineInstr *PrevMI = MI.getPrevNode()) {
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (Flag & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= Flag;
+      Worklist.push_back(PrevMI);
     }
   }
+
+  if (Flag == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 // Scan instructions to determine which ones require an Exact execmask and
@@ -229,10 +305,10 @@
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
-      char Flags = 0;
 
       if (TII->isDS(Opcode)) {
-        Flags = StateWQM;
+        markInstruction(MI, StateWQM, Worklist);
+        GlobalFlags |= StateWQM;
       } else if (TII->isWQM(Opcode)) {
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
@@ -240,13 +316,13 @@
         markUsesWQM(MI, Worklist);
         GlobalFlags |= StateWQM;
       } else if (MI.mayStore() && TII->usesVM_CNT(MI)) {
-        Flags = StateExact;
+        markInstruction(MI, StateExact, Worklist);
+        GlobalFlags |= StateExact;
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
-          // The function is in machine SSA form, which means that physical
-          // VGPRs correspond to shader inputs and outputs. Inputs are
+          // Physical VGPRs correspond to shader inputs and outputs. Inputs are
           // only used, outputs are only defined.
           for (const MachineOperand &MO : MI.defs()) {
             if (!MO.isReg())
@@ -256,24 +332,20 @@
 
             if (!TRI->isVirtualRegister(Reg) &&
                 TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
-              Flags = StateWQM;
+              markInstruction(MI, StateWQM, Worklist);
+              GlobalFlags |= StateWQM;
               break;
             }
           }
         }
-
-        if (!Flags)
-          continue;
       }
-
-      Instructions[&MI].Needs = Flags;
-      Worklist.push_back(&MI);
-      GlobalFlags |= Flags;
     }
 
     if (WQMOutputs && MBB.succ_empty()) {
       // This is a prolog shader. Make sure we go back to exact mode at the end.
-      Blocks[&MBB].OutNeeds = StateExact;
+      assert(!Blocks[&MBB].Needs.Out);
+      Blocks[&MBB].Needs.Out = StateExact;
+      Blocks[&MBB].Needs.In |= StateExact;
       Worklist.push_back(&MBB);
       GlobalFlags |= StateExact;
     }
@@ -282,78 +354,118 @@
   return GlobalFlags;
 }
 
-void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
-                                           std::vector<WorkItem>& Worklist) {
-  MachineBasicBlock *MBB = MI.getParent();
-  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
-  BlockInfo &BI = Blocks[MBB];
+/// Helper function of \ref propagateValue that handles individual subranges.
+void SIWholeQuadMode::propagateValueSub(const WorkItem::Value &V,
+                                        LaneBitmask LaneMask,
+                                        const LiveRange &LR,
+                                        std::vector<WorkItem> &Worklist) {
+  const VNInfo *Value = LR.Query(V.Slot).valueIn();
+  if (!Value)
+    return;
 
-  // Control flow-type instructions that are followed by WQM computations
-  // must themselves be in WQM.
-  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) && MI.isTerminator()) {
-    Instructions[&MI].Needs = StateWQM;
-    II.Needs = StateWQM;
+  if (Value->isPHIDef()) {
+    MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
+
+    for (MachineBasicBlock *Pred : MBB->predecessors()) {
+      SlotIndex PredIndex = LIS->getMBBEndIdx(Pred).getPrevIndex();
+
+      markValueWQM(PredIndex, V.Reg, LaneMask, Worklist);
+    }
+  } else {
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(Value->def);
+
+    // Obviously skip if DefMI is already flagged as NeedWQM.
+    //
+    // The instruction might also be flagged as NeedExact. This happens when
+    // the result of an atomic is used in a WQM computation. In this case,
+    // the atomic must not run for helper pixels and the WQM result is
+    // undefined.
+    if (!Instructions[DefMI].Needs)
+      markInstruction(*DefMI, StateWQM, Worklist);
   }
+}
 
-  // Propagate to block level
-  BI.Needs |= II.Needs;
-  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
-    BI.InNeeds |= II.Needs;
-    Worklist.push_back(MBB);
+void SIWholeQuadMode::propagateValue(const WorkItem::Value &V,
+                                     std::vector<WorkItem> &Worklist) {
+  const LiveInterval &LI = LIS->getInterval(V.Reg);
+  LaneBitmask LaneMask = V.LaneMask;
+
+  if (LI.hasSubRanges()) {
+    for (const LiveInterval::SubRange &S : LI.subranges()) {
+      LaneBitmask Common = LaneMask & S.LaneMask;
+
+      if (Common) {
+        LaneMask &= ~Common;
+        propagateValueSub(V, Common, S, Worklist);
+      }
+    }
+  } else {
+    propagateValueSub(V, LaneMask, LI, Worklist);
   }
+}
+
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+                                           std::vector<WorkItem> &Worklist) {
+  const InstrInfo II = Instructions[&MI];
 
   // Propagate backwards within block
   if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = II.Needs | II.OutNeeds;
-    if (!PrevMI->isPHI()) {
-      InstrInfo &PrevII = Instructions[PrevMI];
-      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
-        PrevII.OutNeeds |= InNeeds;
-        Worklist.push_back(PrevMI);
-      }
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (II.OutNeeds & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= II.OutNeeds;
+      Worklist.push_back(PrevMI);
     }
   }
 
-  // Propagate WQM flag to instruction inputs
-  assert(II.Needs != (StateWQM | StateExact));
-  if (II.Needs == StateWQM)
-    markUsesWQM(MI, Worklist);
+  if (MI.getOpcode() == AMDGPU::SI_KILL && II.OutNeeds & StateWQM)
+    markInstruction(MI, StateWQM, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
-                                     std::vector<WorkItem>& Worklist) {
+                                     std::vector<WorkItem> &Worklist) {
   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
 
   // Propagate through instructions
   if (!MBB.empty()) {
     MachineInstr *LastMI = &*MBB.rbegin();
     InstrInfo &LastII = Instructions[LastMI];
-    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
-      LastII.OutNeeds |= BI.OutNeeds;
+    if (BI.Needs.Out & ~LastII.OutNeeds) {
+      LastII.OutNeeds |= BI.Needs.Out;
       Worklist.push_back(LastMI);
     }
   }
 
   // Predecessor blocks must provide for our WQM/Exact needs.
-  for (MachineBasicBlock *Pred : MBB.predecessors()) {
-    BlockInfo &PredBI = Blocks[Pred];
-    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
-      continue;
-
-    PredBI.OutNeeds |= BI.InNeeds;
-    PredBI.InNeeds |= BI.InNeeds;
-    Worklist.push_back(Pred);
+  if (BI.Needs.In & ~BI.Propagated.In) {
+    for (MachineBasicBlock *Pred : MBB.predecessors()) {
+      BlockInfo &PredBI = Blocks[Pred];
+      if (BI.Needs.In & ~PredBI.Needs.Out) {
+        PredBI.Needs.Out |= BI.Needs.In;
+        PredBI.Needs.In |= BI.Needs.In;
+        Worklist.push_back(Pred);
+      }
+    }
   }
 
-  // All successors must be prepared to accept the same set of WQM/Exact data.
-  for (MachineBasicBlock *Succ : MBB.successors()) {
-    BlockInfo &SuccBI = Blocks[Succ];
-    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
-      continue;
+  if (BI.Needs.Out & ~BI.Propagated.Out) {
+    // All successors must be prepared to accept the same set of WQM/Exact
+    // data.
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      BlockInfo &SuccBI = Blocks[Succ];
+      if (BI.Needs.Out & ~SuccBI.Needs.In) {
+        SuccBI.Needs.In |= BI.Needs.Out;
+        Worklist.push_back(Succ);
+      }
+    }
 
-    SuccBI.InNeeds |= BI.OutNeeds;
-    Worklist.push_back(Succ);
+    // Mark terminators as WQM if required
+    if (BI.Needs.Out & ~BI.Propagated.Out & StateWQM) {
+      for (MachineInstr &Terminator : MBB.terminators())
+        markInstruction(Terminator, StateWQM, Worklist);
+    }
   }
+
+  Blocks[&MBB].Propagated = BI.Needs;
 }
 
 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
@@ -366,8 +478,10 @@
 
     if (WI.MI)
       propagateInstruction(*WI.MI, Worklist);
-    else
+    else if (WI.MBB)
       propagateBlock(*WI.MBB, Worklist);
+    else
+      propagateValue(WI.V, Worklist);
   }
 
   return GlobalFlags;
@@ -376,29 +490,37 @@
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               unsigned SaveWQM, unsigned LiveMaskReg) {
+  MachineInstr *MI;
+
   if (SaveWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
-            SaveWQM)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+                 SaveWQM)
+             .addReg(LiveMaskReg);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC)
+             .addReg(LiveMaskReg);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
                             unsigned SavedWQM) {
+  MachineInstr *MI;
+
   if (SavedWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
-        .addReg(SavedWQM);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+             .addReg(SavedWQM);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
@@ -409,12 +531,12 @@
 
   const BlockInfo &BI = BII->second;
 
-  if (!(BI.InNeeds & StateWQM))
+  if (!(BI.Needs.In & StateWQM))
     return;
 
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+  if (!isEntry && !(BI.Needs.Self & StateExact) && BI.Needs.Out != StateExact)
     return;
 
   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
 
@@ -477,17 +599,24 @@
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
         toWQM(MBB, &MI, SavedWQMReg);
-        SavedWQMReg = 0;
+
+        if (SavedWQMReg) {
+          LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+          SavedWQMReg = 0;
+        }
       }
 
       State = Needs;
     }
   }
 
-  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+  if ((BI.Needs.Out & StateWQM) && State != StateWQM) {
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
-  } else if (BI.OutNeeds == StateExact && State != StateExact) {
+
+    if (SavedWQMReg)
+      LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+  } else if (BI.Needs.Out == StateExact && State != StateExact) {
     toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
                              : MBB.getFirstTerminator(),
             0, LiveMaskReg);
@@ -498,8 +627,11 @@
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
     unsigned Dest = MI->getOperand(0).getReg();
-    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
-        .addReg(LiveMaskReg);
+    MachineInstr *NewMI =
+        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+            .addReg(LiveMaskReg);
+    LIS->ReplaceMachineInstrInMaps(*MI, *NewMI);
+    MI->eraseFromParent();
   }
 }
 
@@ -509,6 +641,7 @@
     return false;
 
   Instructions.clear();
+  WQMValues.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
 
@@ -517,6 +650,7 @@
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
+  LIS = &getAnalysis<LiveIntervals>();
 
   char GlobalFlags = analyzeFunction(MF);
   if (!(GlobalFlags & StateWQM)) {
@@ -524,6 +658,8 @@
     return !LiveMaskQueries.empty();
   }
 
+  DEBUG(printInfo(MF));
+
   // Store a copy of the original live mask when required
   unsigned LiveMaskReg = 0;
   {
@@ -532,29 +668,32 @@
 
     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
       LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-          .addReg(AMDGPU::EXEC);
+      MachineInstr *MI =
+          BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+              .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
 
     if (GlobalFlags == StateWQM) {
       // For a shader that needs only WQM, we can just set it once.
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-              AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC);
-
-      lowerLiveMaskQueries(LiveMaskReg);
-      // EntryMI may become invalid here
-      return true;
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
   }
 
-  DEBUG(printInfo());
-  lowerLiveMaskQueries(LiveMaskReg);
+  lowerLiveMaskQueries(LiveMaskReg);
 
-  // Handle the general case
-  for (auto BII : Blocks)
-    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  if (GlobalFlags != StateWQM) {
+    // Handle the general case
+    for (auto &BII : Blocks)
+      processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
+                   BII.first == &*MF.begin());
+  }
+
+  if (LiveMaskReg)
+    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
 
   return true;
 }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 
 ; CHECK-LABEL: {{^}}test1:
-; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, [[LIVE]]
 ;
 ; Note: We could generate better code here if we recognized earlier that
 ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -358,13 +358,77 @@
   ret float %s
 }
 
+; CHECK-LABEL: {{^}}test_subregs:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_interp_p1_f32
+; CHECK: v_interp_p2_f32
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: image_sample
+;
+; Early coalescing merges %c into a 64 bit VGPR pair, so correctness requires
+; tracking of subregisters.
+;
+define amdgpu_ps <4 x float> @test_subregs(float addrspace(1)* inreg %ptr, i32 inreg %prims, <2 x i32> %ij, i32 %idx) #1 {
+main_body:
+  %c = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %prims, <2 x i32> %ij)
+
+  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
+  store float 1.0, float addrspace(1)* %gep
+
+  %c.i = bitcast float %c to i32
+  %c2.0 = insertelement <2 x i32> undef, i32 %c.i, i32 0
+  %c2.i = insertelement <2 x i32> %c2.0, i32 1, i32 1
+  %tex = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %c2.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %tex
+}
+
+; Test tracking of vector condition codes.
+;
+; CHECK-LABEL: {{^}}test_vcc_tracking:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_cmp_gt_i32_e32 vcc,
+; CHECK: ; %else
+; CHECK: image_sample
+; CHECK: ; %if
+; CHECK: image_sample
+; CHECK: ; %end
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+define amdgpu_ps <4 x float> @test_vcc_tracking(float addrspace(1)* inreg %ptr, i32 %sel, i32 %idx) #1 {
+main_body:
+  %cc = icmp sgt i32 %sel, 0
+  br i1 %cc, label %if, label %else
+
+if:
+  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+else:
+  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> , <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+end:
+  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
+
+  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
+  store float 1.0, float addrspace(1)* %gep
+
+  ret <4 x float> %r
+}
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
 
 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #3
+
 declare void @llvm.AMDGPU.kill(float)
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)