Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -555,8 +555,9 @@
     insertPass(&MachineSchedulerID, &RegisterCoalescerID);
   }
 
+  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
+
   addPass(createSIShrinkInstructionsPass());
-  addPass(createSIWholeQuadModePass());
 }
 
 void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -12,10 +12,9 @@
 /// shaders.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
-/// with shader side effects (stores and atomics). This pass is run on the
-/// scheduled machine IR but before register coalescing, so that machine SSA is
-/// available for analysis. It ensures that WQM is enabled when necessary, but
-/// disabled around stores and atomics.
+/// with shader side effects (stores and atomics). This pass is run after
+/// machine instruction scheduling but before register allocation. It ensures
+/// that WQM is enabled when necessary, but disabled around stores and atomics.
 ///
 /// When necessary, this pass creates a function prolog
 ///
@@ -57,6 +56,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Format.h"
+
+#include <map>
 
 using namespace llvm;
 
@@ -71,9 +73,11 @@
 
 struct PrintState {
 public:
-  explicit PrintState(int State) : State(State) {}
+  explicit PrintState(int State, bool Align = false)
+      : State(State), Align(Align) {}
 
   int State;
+  bool Align;
 };
 
 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
@@ -85,6 +89,13 @@
     OS << "Exact";
   }
 
+  if (PS.Align) {
+    if (!PS.State)
+      OS << "     ";
+    else if (!(PS.State & StateExact))
+      OS << "  ";
+  }
+
   return OS;
 }
 
@@ -94,18 +105,35 @@
 };
 
 struct BlockInfo {
-  char Needs = 0;
-  char InNeeds = 0;
-  char OutNeeds = 0;
+  struct NeedsFlags {
+    char Self = 0;
+    char In = 0;
+    char Out = 0;
+  };
+
+  NeedsFlags Needs;
+  NeedsFlags Propagated;
 };
 
 struct WorkItem {
+  struct Value {
+    SlotIndex Slot;
+    unsigned Reg = 0;
+    LaneBitmask LaneMask = 0;
+  };
+
   MachineBasicBlock *MBB = nullptr;
   MachineInstr *MI = nullptr;
+  Value V;
 
   WorkItem() {}
   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
   WorkItem(MachineInstr *MI) : MI(MI) {}
+  WorkItem(SlotIndex Slot, unsigned Reg, unsigned LaneMask) {
+    V.Slot = Slot;
+    V.Reg = Reg;
+    V.LaneMask = LaneMask;
+  }
 };
 
 class SIWholeQuadMode : public MachineFunctionPass {
@@ -117,14 +145,21 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+  std::map<std::pair<SlotIndex, unsigned>, LaneBitmask> WQMValues;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
-  void printInfo();
+  void printInfo(MachineFunction &MF) const;
 
+  void markValueWQM(SlotIndex Slot, unsigned Reg, LaneBitmask LaneMask,
+                    std::vector<WorkItem> &Worklist);
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
   void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
+  void propagateValueSub(const WorkItem::Value &V, LaneBitmask LaneMask,
+                         const LiveRange &LR, std::vector<WorkItem> &Worklist);
+  void propagateValue(const WorkItem::Value &V,
+                      std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB,
                       std::vector<WorkItem> &Worklist);
   char analyzeFunction(MachineFunction &MF);
@@ -151,6 +186,8 @@
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveIntervals>();
+    AU.addPreserved<SlotIndexes>();
+    AU.addPreserved<LiveIntervals>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -172,24 +209,40 @@
   return new SIWholeQuadMode;
 }
 
-void SIWholeQuadMode::printInfo() {
-  for (const auto &BII : Blocks) {
-    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n"
-           << "  InNeeds = " << PrintState(BII.second.InNeeds)
-           << ", Needs = " << PrintState(BII.second.Needs)
-           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
+void SIWholeQuadMode::printInfo(MachineFunction &MF) const {
+  for (MachineBasicBlock &MBB : MF) {
+    BlockInfo BI;
+    auto BII = Blocks.find(&MBB);
+    if (BII != Blocks.end())
+      BI = BII->second;
+
+    dbgs() << "\nBB#" << MBB.getNumber() << ":\n"
+           << "  In = " << PrintState(BI.Needs.In)
+           << ", Self = " << PrintState(BI.Needs.Self)
+           << ", Out = " << PrintState(BI.Needs.Out) << '\n';
 
-    for (const MachineInstr &MI : *BII.first) {
+    for (MachineInstr &MI : MBB) {
+      InstrInfo II;
       auto III = Instructions.find(&MI);
-      if (III == Instructions.end())
-        continue;
+      if (III != Instructions.end())
+        II = III->second;
 
-      dbgs() << "  " << MI << " Needs = " << PrintState(III->second.Needs)
-             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
+      dbgs() << ' ' << PrintState(II.Needs, true) << ' ' << MI;
     }
   }
 }
 
+void SIWholeQuadMode::markValueWQM(SlotIndex Slot, unsigned Reg,
+                                   LaneBitmask LaneMask,
+                                   std::vector<WorkItem> &Worklist) {
+  LaneBitmask &Mask = WQMValues[std::make_pair(Slot, Reg)];
+  LaneMask &= ~Mask;
+  if (LaneMask) {
+    Mask |= LaneMask;
+    Worklist.emplace_back(Slot, Reg, LaneMask);
+  }
+}
+
 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
@@ -204,12 +257,33 @@
     return;
 
   II.Needs = Flag;
-  Worklist.push_back(&MI);
+
+  MachineBasicBlock *MBB = MI.getParent();
+  BlockInfo &BI = Blocks[MBB];
+
+  if (!(BI.Needs.Self & Flag)) {
+    BI.Needs.Self |= Flag;
+    BI.Needs.In |= Flag;
+    Worklist.push_back(MBB);
+  }
+
+  if (MachineInstr *PrevMI = MI.getPrevNode()) {
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (Flag & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= Flag;
+      Worklist.push_back(PrevMI);
+    }
+  }
+
+  if (Flag == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 /// Mark all instructions defining the uses in \p MI as WQM.
 void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
                                   std::vector<WorkItem> &Worklist) {
+  SlotIndex Idx = LIS->getInstructionIndex(MI);
+
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
@@ -229,8 +303,8 @@
     if (!Value)
       continue;
 
-    // Since we're in machine SSA, we do not need to track physical
-    // registers across basic blocks.
+    // We do not need to track physical registers across basic blocks
+    // before register allocation.
    if (Value->isPHIDef())
       continue;
 
@@ -241,8 +315,11 @@
       continue;
     }
 
-    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
+    unsigned SubReg = Use.getSubReg();
+    LaneBitmask LaneMask = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+                                  : MRI->getMaxLaneMaskForVReg(Reg);
+
+    markValueWQM(Idx, Reg, LaneMask, Worklist);
   }
 }
 
@@ -259,25 +336,24 @@
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
-      char Flags = 0;
 
       if (TII->isDS(Opcode)) {
-        Flags = StateWQM;
+        markInstruction(MI, StateWQM, Worklist);
+        GlobalFlags |= StateWQM;
       } else if (TII->isWQM(Opcode)) {
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
         markUsesWQM(MI, Worklist);
         GlobalFlags |= StateWQM;
-        continue;
       } else if (TII->isDisableWQM(MI)) {
-        Flags = StateExact;
+        markInstruction(MI, StateExact, Worklist);
+        GlobalFlags |= StateExact;
       } else {
         if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
-          // The function is in machine SSA form, which means that physical
-          // VGPRs correspond to shader inputs and outputs. Inputs are
+          // Physical VGPRs correspond to shader inputs and outputs. Inputs are
           // only used, outputs are only defined.
           for (const MachineOperand &MO : MI.defs()) {
             if (!MO.isReg())
               continue;
@@ -287,98 +363,128 @@
 
             if (!TRI->isVirtualRegister(Reg) &&
                 TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
-              Flags = StateWQM;
+              markInstruction(MI, StateWQM, Worklist);
+              GlobalFlags |= StateWQM;
               break;
             }
          }
        }
-
-        if (!Flags)
-          continue;
      }
-
-      markInstruction(MI, Flags, Worklist);
-      GlobalFlags |= Flags;
    }
  }
 
  return GlobalFlags;
 }
 
-void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
-                                           std::vector<WorkItem>& Worklist) {
-  MachineBasicBlock *MBB = MI.getParent();
-  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
-  BlockInfo &BI = Blocks[MBB];
+/// Helper function of \ref propagateValue that handles individual subranges.
+void SIWholeQuadMode::propagateValueSub(const WorkItem::Value &V,
+                                        LaneBitmask LaneMask,
+                                        const LiveRange &LR,
+                                        std::vector<WorkItem> &Worklist) {
+  const VNInfo *Value = LR.Query(V.Slot).valueIn();
+  if (!Value)
+    return;
 
-  // Control flow-type instructions and stores to temporary memory that are
-  // followed by WQM computations must themselves be in WQM.
-  if ((II.OutNeeds & StateWQM) && !II.Needs &&
-      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
-    Instructions[&MI].Needs = StateWQM;
-    II.Needs = StateWQM;
-  }
+  if (Value->isPHIDef()) {
+    MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
 
-  // Propagate to block level
-  BI.Needs |= II.Needs;
-  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
-    BI.InNeeds |= II.Needs;
-    Worklist.push_back(MBB);
+    for (MachineBasicBlock *Pred : MBB->predecessors()) {
+      SlotIndex PredIndex = LIS->getMBBEndIdx(Pred).getPrevIndex();
+
+      markValueWQM(PredIndex, V.Reg, LaneMask, Worklist);
+    }
+  } else {
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(Value->def);
+
+    markInstruction(*DefMI, StateWQM, Worklist);
   }
+}
 
-  // Propagate backwards within block
-  if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = II.Needs | II.OutNeeds;
-    if (!PrevMI->isPHI()) {
-      InstrInfo &PrevII = Instructions[PrevMI];
-      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
-        PrevII.OutNeeds |= InNeeds;
-        Worklist.push_back(PrevMI);
+void SIWholeQuadMode::propagateValue(const WorkItem::Value &V,
+                                     std::vector<WorkItem> &Worklist) {
+  const LiveInterval &LI = LIS->getInterval(V.Reg);
+  LaneBitmask LaneMask = V.LaneMask;
+
+  if (LI.hasSubRanges()) {
+    for (const LiveInterval::SubRange &S : LI.subranges()) {
+      LaneBitmask Common = LaneMask & S.LaneMask;
+
+      if (Common) {
+        LaneMask &= ~Common;
+        propagateValueSub(V, Common, S, Worklist);
       }
     }
+  } else {
+    propagateValueSub(V, LaneMask, LI, Worklist);
   }
+}
 
-  // Propagate WQM flag to instruction inputs
-  assert(II.Needs != (StateWQM | StateExact));
+// Backwards propagation of OutNeeds and related effects.
+void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
+                                           std::vector<WorkItem> &Worklist) {
+  const InstrInfo II = Instructions[&MI];
 
-  if (II.Needs == StateWQM)
-    markUsesWQM(MI, Worklist);
+  // Stores to temporary memory (i.e., not marked as Exact) may be relevant
+  // for WQM computations.
+  if (!II.Needs && (II.OutNeeds & StateWQM) &&
+      (TII->usesVM_CNT(MI) && MI.mayStore()))
+    markInstruction(MI, StateWQM, Worklist);
+
+  // Propagate backwards within block
+  if (MachineInstr *PrevMI = MI.getPrevNode()) {
+    InstrInfo &PrevII = Instructions[PrevMI];
+    if (II.OutNeeds & ~PrevII.OutNeeds) {
+      PrevII.OutNeeds |= II.OutNeeds;
+      Worklist.push_back(PrevMI);
    }
  }
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
-                                     std::vector<WorkItem>& Worklist) {
+                                     std::vector<WorkItem> &Worklist) {
   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
 
   // Propagate through instructions
   if (!MBB.empty()) {
     MachineInstr *LastMI = &*MBB.rbegin();
     InstrInfo &LastII = Instructions[LastMI];
-    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
-      LastII.OutNeeds |= BI.OutNeeds;
+    if (BI.Needs.Out & ~LastII.OutNeeds) {
+      LastII.OutNeeds |= BI.Needs.Out;
       Worklist.push_back(LastMI);
     }
   }
 
   // Predecessor blocks must provide for our WQM/Exact needs.
-  for (MachineBasicBlock *Pred : MBB.predecessors()) {
-    BlockInfo &PredBI = Blocks[Pred];
-    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
-      continue;
-
-    PredBI.OutNeeds |= BI.InNeeds;
-    PredBI.InNeeds |= BI.InNeeds;
-    Worklist.push_back(Pred);
+  if (BI.Needs.In & ~BI.Propagated.In) {
+    for (MachineBasicBlock *Pred : MBB.predecessors()) {
+      BlockInfo &PredBI = Blocks[Pred];
+      if (BI.Needs.In & ~PredBI.Needs.Out) {
+        PredBI.Needs.Out |= BI.Needs.In;
+        PredBI.Needs.In |= BI.Needs.In;
+        Worklist.push_back(Pred);
+      }
+    }
   }
 
-  // All successors must be prepared to accept the same set of WQM/Exact data.
-  for (MachineBasicBlock *Succ : MBB.successors()) {
-    BlockInfo &SuccBI = Blocks[Succ];
-    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
-      continue;
+  if (BI.Needs.Out & ~BI.Propagated.Out) {
+    // All successors must be prepared to accept the same set of WQM/Exact
+    // data.
+    for (MachineBasicBlock *Succ : MBB.successors()) {
+      BlockInfo &SuccBI = Blocks[Succ];
+      if (BI.Needs.Out & ~SuccBI.Needs.In) {
+        SuccBI.Needs.In |= BI.Needs.Out;
+        Worklist.push_back(Succ);
+      }
+    }
 
-    SuccBI.InNeeds |= BI.OutNeeds;
-    Worklist.push_back(Succ);
+    // Mark terminators as WQM if required
+    if (BI.Needs.Out & ~BI.Propagated.Out & StateWQM) {
+      for (MachineInstr &Terminator : MBB.terminators())
+        markInstruction(Terminator, StateWQM, Worklist);
+    }
   }
+
+  Blocks[&MBB].Propagated = BI.Needs;
 }
 
 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
@@ -391,8 +497,10 @@
 
     if (WI.MI)
       propagateInstruction(*WI.MI, Worklist);
-    else
+    else if (WI.MBB)
       propagateBlock(*WI.MBB, Worklist);
+    else
+      propagateValue(WI.V, Worklist);
   }
 
   return GlobalFlags;
@@ -401,29 +509,37 @@
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               unsigned SaveWQM, unsigned LiveMaskReg) {
+  MachineInstr *MI;
+
   if (SaveWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
-            SaveWQM)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+                 SaveWQM)
+             .addReg(LiveMaskReg);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC)
-        .addReg(LiveMaskReg);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC)
+             .addReg(LiveMaskReg);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
                             unsigned SavedWQM) {
+  MachineInstr *MI;
+
   if (SavedWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
-        .addReg(SavedWQM);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+             .addReg(SavedWQM);
   } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC);
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+                 AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC);
   }
+
+  LIS->InsertMachineInstrInMaps(*MI);
 }
 
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
@@ -434,12 +550,12 @@
 
   const BlockInfo &BI = BII->second;
 
-  if (!(BI.InNeeds & StateWQM))
+  if (!(BI.Needs.In & StateWQM))
     return;
 
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+  if (!isEntry && !(BI.Needs.Self & StateExact) && BI.Needs.Out != StateExact)
     return;
 
   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
@@ -502,20 +618,27 @@
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
         toWQM(MBB, &MI, SavedWQMReg);
-        SavedWQMReg = 0;
+
+        if (SavedWQMReg) {
+          LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+          SavedWQMReg = 0;
+        }
       }
 
       State = Needs;
     }
 
-    if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
+    if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.Needs.Out == StateExact)
       MI.getOperand(3).setImm(1);
   }
 
-  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+  if ((BI.Needs.Out & StateWQM) && State != StateWQM) {
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
-  } else if (BI.OutNeeds == StateExact && State != StateExact) {
+
+    if (SavedWQMReg)
+      LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+  } else if (BI.Needs.Out == StateExact && State != StateExact) {
     toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
                              : MBB.getFirstTerminator(),
             0, LiveMaskReg);
@@ -526,8 +649,11 @@
   for (MachineInstr *MI : LiveMaskQueries) {
     const DebugLoc &DL = MI->getDebugLoc();
     unsigned Dest = MI->getOperand(0).getReg();
-    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
-        .addReg(LiveMaskReg);
+    MachineInstr *NewMI =
+        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+            .addReg(LiveMaskReg);
+    LIS->ReplaceMachineInstrInMaps(*MI, *NewMI);
+    MI->eraseFromParent();
   }
 }
 
@@ -537,6 +663,7 @@
     return false;
 
   Instructions.clear();
+  WQMValues.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
 
@@ -553,6 +680,8 @@
     return !LiveMaskQueries.empty();
   }
 
+  DEBUG(printInfo(MF));
+
   // Store a copy of the original live mask when required
   unsigned LiveMaskReg = 0;
   {
@@ -561,29 +690,31 @@
 
     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
       LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
-          .addReg(AMDGPU::EXEC);
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::COPY), LiveMaskReg)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
 
     if (GlobalFlags == StateWQM) {
       // For a shader that needs only WQM, we can just set it once.
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-              AMDGPU::EXEC)
-          .addReg(AMDGPU::EXEC);
-
-      lowerLiveMaskQueries(LiveMaskReg);
-      // EntryMI may become invalid here
-      return true;
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
+                                 TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
+                             .addReg(AMDGPU::EXEC);
+      LIS->InsertMachineInstrInMaps(*MI);
     }
   }
 
-  DEBUG(printInfo());
-
   lowerLiveMaskQueries(LiveMaskReg);
 
-  // Handle the general case
-  for (auto BII : Blocks)
-    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  if (GlobalFlags != StateWQM) {
+    // Handle the general case
+    for (auto &BII : Blocks)
+      processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+  }
+
+  if (LiveMaskReg)
+    LIS->createAndComputeVirtRegInterval(LiveMaskReg);
 
   return true;
 }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 
 ; CHECK-LABEL: {{^}}test1:
-; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
+; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: v_cndmask_b32_e64 v0, 0, 1, [[LIVE]]
 ;
 ; Note: We could generate better code here if we recognized earlier that
 ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
@@ -16,8 +17,9 @@
 
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]]
 ; CHECK-DAG: s_wqm_b64 exec, exec
-; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
+; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[COPY]]
 ; CHECK: image_sample v0, [[VAR]],
 define amdgpu_ps float @test2() {
   %live = call i1 @llvm.amdgcn.ps.live()
@@ -31,8 +33,9 @@
 
 ; CHECK-LABEL: {{^}}test3:
 ; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK-DAG: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[LIVE]]
 ; CHECK-DAG: s_wqm_b64 exec, exec
-; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
+; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[COPY]], -1
 ; CHECK_DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
 ; CHECK: ; %dead
 define amdgpu_ps float @test3(i32 %in) {
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -464,6 +464,67 @@
   ret <4 x float> %dtex
 }
 
+; CHECK-LABEL: {{^}}test_subregs:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_interp_p1_f32
+; CHECK: v_interp_p2_f32
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: image_sample
+;
+; Early coalescing merges %c into a 64-bit VGPR pair, so correctness requires
+; tracking of subregisters.
+;
+define amdgpu_ps <4 x float> @test_subregs(i32 inreg %prims, <2 x i32> %ij, i32 %idx) #1 {
+main_body:
+  %c = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %prims, <2 x i32> %ij)
+
+  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+
+  %c.i = bitcast float %c to i32
+  %c2.0 = insertelement <2 x i32> undef, i32 %c.i, i32 0
+  %c2.i = insertelement <2 x i32> %c2.0, i32 1, i32 1
+  %tex = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %c2.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %tex
+}
+
+; Test tracking of vector condition codes.
+;
+; CHECK-LABEL: {{^}}test_vcc_tracking:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_cmp_gt_i32_e32 vcc,
+; CHECK: ; %else
+; CHECK: image_sample
+; CHECK: ; %if
+; CHECK: image_sample
+; CHECK: ; %end
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: _store
+define amdgpu_ps <4 x float> @test_vcc_tracking(i32 %sel, i32 %idx) #1 {
+main_body:
+  %cc = icmp sgt i32 %sel, 0
+  br i1 %cc, label %if, label %else
+
+if:
+  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+else:
+  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 0>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %end
+
+end:
+  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
+
+  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+
+  ret <4 x float> %r
+}
+
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
@@ -472,8 +533,10 @@
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
 
 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #3
 declare void @llvm.AMDGPU.kill(float)
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)