Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h @@ -44,6 +44,7 @@ FunctionPass *createSILowerI1CopiesPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); +FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSILowerControlFlowPass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIFixSGPRCopiesPass(); @@ -70,6 +71,9 @@ void initializeSILoadStoreOptimizerPass(PassRegistry &); extern char &SILoadStoreOptimizerID; +void initializeSIWholeQuadModePass(PassRegistry &); +extern char &SIWholeQuadModeID; + void initializeSILowerControlFlowPass(PassRegistry &); extern char &SILowerControlFlowPassID; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -62,7 +62,6 @@ int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -57,6 +57,7 @@ initializeSIAnnotateControlFlowPass(*PR); initializeSIInsertNopsPass(*PR); initializeSIInsertWaitsPass(*PR); + initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); } @@ -346,6 +347,7 @@ insertPass(&MachineSchedulerID, &RegisterCoalescerID); } addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIWholeQuadModePass()); } void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt @@ -63,6 +63,7 @@ SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp + SIWholeQuadMode.cpp ) add_subdirectory(AsmParser) Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h @@ -149,6 +149,10 @@ MachineBasicBlock::iterator &MI, LiveVariables *LV) const override; + bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + static bool isSALU(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::SALU; } Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1248,6 +1248,19 @@ .addImm(0); // omod } +bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const { + // Target-independent instructions do not have an implicit-use of EXEC, even + // when they operate on VGPRs. 
Treating EXEC modifications as scheduling
+  // boundaries prevents incorrect movements of such instructions.
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
+    return true;
+
+  return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
+}
+
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
   int64_t SVal = Imm.getSExtValue();
   if (SVal >= -16 && SVal <= 64)
Index: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -78,7 +78,7 @@
   void SkipIfDead(MachineInstr &MI);
 
   void If(MachineInstr &MI);
-  void Else(MachineInstr &MI);
+  void Else(MachineInstr &MI, bool ExecModified);
   void Break(MachineInstr &MI);
   void IfBreak(MachineInstr &MI);
   void ElseBreak(MachineInstr &MI);
@@ -215,7 +215,7 @@
   MI.eraseFromParent();
 }
 
-void SILowerControlFlow::Else(MachineInstr &MI) {
+void SILowerControlFlow::Else(MachineInstr &MI, bool ExecModified) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MI.getDebugLoc();
   unsigned Dst = MI.getOperand(0).getReg();
@@ -225,6 +225,15 @@
           TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
           .addReg(Src); // Saved EXEC
 
+  if (ExecModified) {
+    // Adjust the saved exec to account for the modifications during the flow
+    // block that contains the ELSE. This can happen when WQM mode is switched
+    // off.
+    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+            .addReg(AMDGPU::EXEC)
+            .addReg(Dst);
+  }
+
   BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
           .addReg(AMDGPU::EXEC)
           .addReg(Dst);
@@ -488,7 +497,6 @@
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
-  bool NeedWQM = false;
   bool NeedFlat = false;
   unsigned Depth = 0;
 
@@ -498,17 +506,24 @@
     MachineBasicBlock *EmptyMBBAtEnd = NULL;
     MachineBasicBlock &MBB = *BI;
     MachineBasicBlock::iterator I, Next;
+    bool ExecModified = false;
+
     for (I = MBB.begin(); I != MBB.end(); I = Next) {
       Next = std::next(I);
 
       MachineInstr &MI = *I;
-      if (TII->isWQM(MI) || TII->isDS(MI))
-        NeedWQM = true;
 
       // Flat uses m0 in case it needs to access LDS.
       if (TII->isFLAT(MI))
         NeedFlat = true;
 
+      for (const auto &Def : I->defs()) {
+        if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) {
+          ExecModified = true;
+          break;
+        }
+      }
+
       switch (MI.getOpcode()) {
         default: break;
         case AMDGPU::SI_IF:
@@ -517,7 +532,7 @@
           break;
 
         case AMDGPU::SI_ELSE:
-          Else(MI);
+          Else(MI, ExecModified);
           break;
 
         case AMDGPU::SI_BREAK:
@@ -599,12 +614,6 @@
     }
   }
 
-  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
-    MachineBasicBlock &MBB = MF.front();
-    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
-  }
-
   if (NeedFlat && MFI->IsKernel) {
     // TODO: What to use with function calls?
     // We will need to Initialize the flat scratch register pair.
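
Note on the SI_ELSE lowering above: when EXEC has been modified inside the flow
block containing the ELSE (which the whole quad mode switches added by this
patch can do), the lowering now carries an extra S_AND_B64 on the saved mask.
A rough sketch of the resulting sequence (cf. the %Flow block checks in
test_control_flow_1 of the new wqm.ll test):

    s_or_saveexec_b64 dst, src
    s_and_b64 dst, exec, dst    ; only emitted when ExecModified is set
    s_xor_b64 exec, exec, dst
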
Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h @@ -72,9 +72,12 @@ } bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + const TargetRegisterClass *RC; if (TargetRegisterInfo::isVirtualRegister(Reg)) - return isSGPRClass(MRI.getRegClass(Reg)); - return getPhysRegClass(Reg); + RC = MRI.getRegClass(Reg); + else + RC = getPhysRegClass(Reg); + return isSGPRClass(RC); } /// \returns true if this class contains VGPR registers. Index: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -0,0 +1,465 @@ +//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass adds instructions to enable whole quad mode for pixel +/// shaders. +/// +/// Whole quad mode is required for derivative computations, but it interferes +/// with shader side effects (stores and atomics). This pass is run on the +/// scheduled machine IR but before register coalescing, so that machine SSA is +/// available for analysis. It ensures that WQM is enabled when necessary, but +/// disabled around stores and atomics. +/// +/// When necessary, this pass creates a function prolog +/// +/// S_MOV_B64 LiveMask, EXEC +/// S_WQM_B64 EXEC, EXEC +/// +/// to enter WQM at the top of the function and surrounds blocks of Exact +/// instructions by +/// +/// S_AND_SAVEEXEC_B64 Tmp, LiveMask +/// ... +/// S_MOV_B64 EXEC, Tmp +/// +/// In order to avoid excessive switching during sequences of Exact +/// instructions, the pass first analyzes which instructions must be run in WQM +/// (aka which instructions produce values that lead to derivative +/// computations). +/// +/// Basic blocks are always exited in WQM as long as some successor needs WQM. +/// +/// There is room for improvement given better control flow analysis: +/// +/// (1) at the top level (outside of control flow statements, and as long as +/// kill hasn't been used), one SGPR can be saved by recovering WQM from +/// the LiveMask (this is implemented for the entry block). +/// +/// (2) when entire regions (e.g. if-else blocks or entire loops) only +/// consist of exact and don't-care instructions, the switch only has to +/// be done at the entry and exit points rather than potentially in each +/// block of the region. 
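+///
+/// As an illustration (mirroring the cases covered by the new
+/// test/CodeGen/AMDGPU/wqm.ll test), a pixel shader containing the sequence
+///
+///   IMAGE_SAMPLE          ; computes derivatives, needs WQM
+///   BUFFER_STORE_DWORD    ; side effect, must run Exact
+///   IMAGE_SAMPLE          ; needs WQM again
+///
+/// has EXEC ANDed with the LiveMask before the store and is switched back to
+/// WQM (via S_WQM_B64 in the entry block, or a saved copy elsewhere) before
+/// the second sample.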
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-wqm"
+
+namespace {
+
+enum {
+  StateWQM = 0x1,
+  StateExact = 0x2,
+};
+
+struct InstrInfo {
+  char Needs = 0;
+  char OutNeeds = 0;
+};
+
+struct BlockInfo {
+  char Needs = 0;
+  char InNeeds = 0;
+  char OutNeeds = 0;
+};
+
+struct WorkItem {
+  const MachineBasicBlock *MBB = nullptr;
+  const MachineInstr *MI = nullptr;
+
+  WorkItem() {}
+  WorkItem(const MachineBasicBlock *MBB) : MBB(MBB) {}
+  WorkItem(const MachineInstr *MI) : MI(MI) {}
+};
+
+class SIWholeQuadMode : public MachineFunctionPass {
+private:
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  MachineRegisterInfo *MRI;
+
+  DenseMap<const MachineInstr *, InstrInfo> Instructions;
+  DenseMap<const MachineBasicBlock *, BlockInfo> Blocks;
+  SmallVector<const MachineInstr *, 2> ExecExports;
+
+  char scanInstructions(const MachineFunction &MF, std::vector<WorkItem>& Worklist);
+  void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem>& Worklist);
+  void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem>& Worklist);
+  char analyzeFunction(const MachineFunction &MF);
+
+  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+               unsigned SaveWQM, unsigned LiveMaskReg);
+  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+             unsigned SavedWQM);
+  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
+
+public:
+  static char ID;
+
+  SIWholeQuadMode() :
+    MachineFunctionPass(ID) { }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "SI Whole Quad Mode";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace
+
+char SIWholeQuadMode::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE,
+                      "SI Whole Quad Mode", false, false)
+INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE,
+                    "SI Whole Quad Mode", false, false)
+
+char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
+
+FunctionPass *llvm::createSIWholeQuadModePass() {
+  return new SIWholeQuadMode;
+}
+
+// Scan instructions to determine which ones require an Exact execmask and
+// which ones seed WQM requirements.
+char SIWholeQuadMode::scanInstructions(const MachineFunction &MF,
+                                       std::vector<WorkItem> &Worklist) {
+  char GlobalFlags = 0;
+
+  for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
+    const MachineBasicBlock &MBB = *BI;
+
+    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
+      const MachineInstr &MI = *II;
+      unsigned Opcode = MI.getOpcode();
+      char Flags;
+
+      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+        Flags = StateWQM;
+      } else if (TII->get(Opcode).mayStore() &&
+                 (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) {
+        Flags = StateExact;
+      } else {
+        // Handle export instructions with the exec mask valid flag set
+        if (Opcode == AMDGPU::EXP && MI.getOperand(4).getImm() != 0)
+          ExecExports.push_back(&MI);
+        continue;
+      }
+
+      Instructions[&MI].Needs = Flags;
+      Worklist.push_back(&MI);
+      GlobalFlags |= Flags;
+    }
+  }
+
+  return GlobalFlags;
+}
+
+void SIWholeQuadMode::propagateInstruction(const MachineInstr &MI,
+                                           std::vector<WorkItem>& Worklist) {
+  const MachineBasicBlock &MBB = *MI.getParent();
+  InstrInfo &II = Instructions[&MI];
+  BlockInfo &BI = Blocks[&MBB];
+
+  // Control flow-type instructions that are followed by WQM computations
+  // must themselves be in WQM.
+  if ((II.OutNeeds & StateWQM) && !(II.Needs & StateWQM) &&
+      (MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL))
+    II.Needs = StateWQM;
+
+  // Propagate to block level
+  BI.Needs |= II.Needs;
+  if ((BI.InNeeds | II.Needs) != BI.InNeeds) {
+    BI.InNeeds |= II.Needs;
+    Worklist.push_back(&MBB);
+  }
+
+  // Propagate backwards within block
+  if (const MachineInstr *PrevMI = MI.getPrevNode()) {
+    char InNeeds = II.Needs | II.OutNeeds;
+    if (!PrevMI->isPHI()) {
+      InstrInfo &PrevII = Instructions[PrevMI];
+      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
+        PrevII.OutNeeds |= InNeeds;
+        Worklist.push_back(PrevMI);
+      }
+    }
+  }
+
+  // Propagate WQM flag to instruction inputs
+  assert(II.Needs != (StateWQM | StateExact));
+  if (II.Needs != StateWQM)
+    return;
+
+  for (const MachineOperand &Use : MI.uses()) {
+    if (!Use.isReg() || !Use.isUse())
+      continue;
+
+    // At this point, physical registers appear as inputs or outputs
+    // and following them makes no sense (and would in fact be incorrect
+    // when the same VGPR is used as both an output and an input that leads
+    // to a NeedsWQM instruction).
+    //
+    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
+    // have to trace this, in practice it happens for 64-bit computations like
+    // pointers where both dwords are followed already anyway.
+    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+      continue;
+
+    for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
+      const MachineInstr *DefMI = Def.getParent();
+      InstrInfo &DefII = Instructions[DefMI];
+
+      // Obviously skip if DefMI is already flagged as NeedWQM.
+      //
+      // The instruction might also be flagged as NeedExact. This happens when
+      // the result of an atomic is used in a WQM computation. In this case,
+      // the atomic must not run for helper pixels and the WQM result is
+      // undefined.
+      if (DefII.Needs != 0)
+        continue;
+
+      DefII.Needs = StateWQM;
+      Worklist.push_back(DefMI);
+    }
+  }
+}
+
+void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB,
+                                     std::vector<WorkItem>& Worklist) {
+  BlockInfo &BI = Blocks[&MBB];
+
+  // Propagate through instructions
+  if (!MBB.empty()) {
+    const MachineInstr *LastMI = &*MBB.rbegin();
+    InstrInfo &LastII = Instructions[LastMI];
+    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
+      LastII.OutNeeds |= BI.OutNeeds;
+      Worklist.push_back(LastMI);
+    }
+  }
+
+  // Predecessor blocks must provide for our WQM/Exact needs.
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    BlockInfo &PredBI = Blocks[Pred];
+    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
+      continue;
+
+    PredBI.OutNeeds |= BI.InNeeds;
+    PredBI.InNeeds |= BI.InNeeds;
+    Worklist.push_back(Pred);
+  }
+
+  // All successors must be prepared to accept the same set of WQM/Exact
+  // data.
+  for (const MachineBasicBlock *Succ : MBB.successors()) {
+    BlockInfo &SuccBI = Blocks[Succ];
+    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
+      continue;
+
+    SuccBI.InNeeds |= BI.OutNeeds;
+    Worklist.push_back(Succ);
+  }
+}
+
+char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) {
+  std::vector<WorkItem> Worklist;
+  char GlobalFlags = scanInstructions(MF, Worklist);
+
+  while (!Worklist.empty()) {
+    WorkItem WI = Worklist.back();
+    Worklist.pop_back();
+
+    if (WI.MI)
+      propagateInstruction(*WI.MI, Worklist);
+    else
+      propagateBlock(*WI.MBB, Worklist);
+  }
+
+  return GlobalFlags;
+}
+
+void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator Before,
+                              unsigned SaveWQM, unsigned LiveMaskReg)
+{
+  if (SaveWQM) {
+    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+            SaveWQM)
+            .addReg(LiveMaskReg);
+  } else {
+    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+            AMDGPU::EXEC)
+            .addReg(AMDGPU::EXEC)
+            .addReg(LiveMaskReg);
+  }
+}
+
+void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator Before,
+                            unsigned SavedWQM)
+{
+  if (SavedWQM) {
+    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+            .addReg(SavedWQM);
+  } else {
+    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+            AMDGPU::EXEC)
+            .addReg(AMDGPU::EXEC);
+  }
+}
+
+void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
+                                   bool isEntry) {
+  auto BII = Blocks.find(&MBB);
+  if (BII == Blocks.end())
+    return;
+
+  const BlockInfo &BI = BII->second;
+
+  if (!(BI.InNeeds & StateWQM))
+    return;
+
+  // This is a non-entry block that is WQM throughout, so no need to do
+  // anything.
+  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+    return;
+
+  unsigned SavedWQMReg = 0;
+  bool WQMFromExec = isEntry;
+  char State = isEntry ? StateExact : StateWQM;
+
+  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
+  while (II != IE) {
+    MachineInstr &MI = *II;
+    ++II;
+
+    // Skip instructions that are not affected by EXEC
+    if (MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD) &&
+        !MI.isBranch() && !MI.isTerminator())
+      continue;
+
+    // Generic instructions such as COPY will either disappear by register
+    // coalescing or be lowered to SALU or VALU instructions.
+    if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
+      if (MI.getNumExplicitOperands() >= 1) {
+        const MachineOperand &Op = MI.getOperand(0);
+        if (Op.isReg()) {
+          if (TRI->isSGPRReg(*MRI, Op.getReg())) {
+            // SGPR instructions are not affected by EXEC
+            continue;
+          }
+        }
+      }
+    }
+
+    char Needs = 0;
+    char OutNeeds = 0;
+    auto InstrInfoIt = Instructions.find(&MI);
+    if (InstrInfoIt != Instructions.end()) {
+      Needs = InstrInfoIt->second.Needs;
+      OutNeeds = InstrInfoIt->second.OutNeeds;
+
+      // Make sure to switch to Exact mode before the end of the block when
+      // Exact and only Exact is needed further downstream.
+      if (OutNeeds == StateExact && (MI.isBranch() || MI.isTerminator())) {
+        assert(Needs == 0);
+        Needs = StateExact;
+      }
+    }
+
+    // State switching
+    if (Needs && State != Needs) {
+      if (Needs == StateExact) {
+        assert(!SavedWQMReg);
+
+        if (!WQMFromExec && (OutNeeds & StateWQM))
+          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+        toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+      } else {
+        assert(WQMFromExec == (SavedWQMReg == 0));
+        toWQM(MBB, &MI, SavedWQMReg);
+        SavedWQMReg = 0;
+      }
+
+      State = Needs;
+    }
+
+    if (MI.getOpcode() == AMDGPU::SI_KILL)
+      WQMFromExec = false;
+  }
+
+  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+    assert(WQMFromExec == (SavedWQMReg == 0));
+    toWQM(MBB, MBB.end(), SavedWQMReg);
+  } else if (BI.OutNeeds == StateExact && State != StateExact) {
+    toExact(MBB, MBB.end(), 0, LiveMaskReg);
+  }
+}
+
+bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  if (MFI->getShaderType() != ShaderType::PIXEL)
+    return false;
+
+  Instructions.clear();
+  Blocks.clear();
+  ExecExports.clear();
+
+  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  TRI = static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
+  MRI = &MF.getRegInfo();
+
+  char GlobalFlags = analyzeFunction(MF);
+  if (!(GlobalFlags & StateWQM))
+    return false;
+
+  MachineBasicBlock &Entry = MF.front();
+  MachineInstr *EntryMI = Entry.getFirstNonPHI();
+
+  if (GlobalFlags == StateWQM) {
+    // For a shader that needs only WQM, we can just set it once.
+    BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
+    return true;
+  }
+
+  // Handle the general case
+  unsigned LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+          .addReg(AMDGPU::EXEC);
+
+  for (const auto &BII : Blocks)
+    processBlock(const_cast<MachineBasicBlock &>(*BII.first), LiveMaskReg,
+                 BII.first == &*MF.begin());
+
+  return true;
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
@@ -0,0 +1,348 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+
+; Check that WQM isn't triggered by image load/store intrinsics.
+; +;CHECK-LABEL: {{^}}test1: +;CHECK-NOT: s_wqm +define <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret <4 x float> %tex +} + +; Check that WQM is triggered by image samples and left untouched for loads... +; +;CHECK-LABEL: {{^}}test2: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK-NOT: exec +;CHECK: _load_dword v0, +define float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 { +main_body: + %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.2 = bitcast <4 x float> %c.1 to <4 x i32> + %c.3 = extractelement <4 x i32> %c.2, i32 0 + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3 + %data = load float, float addrspace(1)* %gep + ret float %data +} + +; ... but disabled for stores (and, in this simple case, not re-enabled). +; +;CHECK-LABEL: {{^}}test3: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK-NOT: exec +define <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex.1 = bitcast <4 x float> %tex to <4 x i32> + %tex.2 = extractelement <4 x i32> %tex.1, i32 0 + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2 + %wr = extractelement <4 x float> %tex, i32 1 + store float %wr, float addrspace(1)* %gep + ret <4 x float> %tex +} + +; Check that WQM is re-enabled when required. +; +;CHECK-LABEL: {{^}}test4: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1 +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf +define <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #0 { +main_body: + %c.1 = mul i32 %c, %d + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1 + store float %data, float addrspace(1)* %gep + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +; Check a case of one branch of an if-else requiring WQM, the other requiring +; exact. +; +; Note: In this particular case, the save-and-restore could be avoided if the +; analysis understood that the two branches of the if-else are mutually +; exclusive. 
+; +;CHECK-LABEL: {{^}}test_control_flow_0: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %ELSE +;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]] +;CHECK: store +;CHECK: s_mov_b64 exec, [[SAVED]] +;CHECK: %IF +;CHECK: image_sample +define float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ELSE + +IF: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %data.if = extractelement <4 x float> %tex, i32 0 + br label %END + +ELSE: + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c + store float %data, float addrspace(1)* %gep + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +; Reverse branch order compared to the previous test. +; +;CHECK-LABEL: {{^}}test_control_flow_1: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %IF +;CHECK: image_sample +;CHECK: %Flow +;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], +;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]] +;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]] +;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]] +;CHECK-NEXT: %ELSE +;CHECK: store +;CHECK: %END +define float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %ELSE, label %IF + +IF: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %data.if = extractelement <4 x float> %tex, i32 0 + br label %END + +ELSE: + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c + store float %data, float addrspace(1)* %gep + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +; Check that branch conditions are properly marked as needing WQM... +; +;CHECK-LABEL: {{^}}test_control_flow_2: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: load +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: v_cmp +define <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 { +main_body: + %idx.1 = extractelement <3 x i32> %idx, i32 0 + %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 + %data.1 = extractelement <2 x float> %data, i32 0 + store float %data.1, float addrspace(1)* %gep.1 + + ; The load that determines the branch (and should therefore be WQM) is + ; surrounded by stores that require disabled WQM. 
+ %idx.2 = extractelement <3 x i32> %idx, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 + %z = load float, float addrspace(1)* %gep.2 + + %idx.3 = extractelement <3 x i32> %idx, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 + %data.3 = extractelement <2 x float> %data, i32 1 + store float %data.3, float addrspace(1)* %gep.3 + + %cc = fcmp ogt float %z, 0.0 + br i1 %cc, label %IF, label %ELSE + +IF: + %coord.IF = mul i32 %coord, 3 + br label %END + +ELSE: + %coord.ELSE = mul i32 %coord, 4 + br label %END + +END: + %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +; ... but only if they really do need it. +; +;CHECK-LABEL: {{^}}test_control_flow_3: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: load +;CHECK: store +;CHECK: v_cmp +define float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 { +main_body: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex.1 = extractelement <4 x float> %tex, i32 0 + + %idx.1 = extractelement <3 x i32> %idx, i32 0 + %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 + %data.1 = extractelement <2 x float> %data, i32 0 + store float %data.1, float addrspace(1)* %gep.1 + + %idx.2 = extractelement <3 x i32> %idx, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 + %z = load float, float addrspace(1)* %gep.2 + + %idx.3 = extractelement <3 x i32> %idx, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 + %data.3 = extractelement <2 x float> %data, i32 1 + store float %data.3, float addrspace(1)* %gep.3 + + %cc = fcmp ogt float %z, 0.0 + br i1 %cc, label %IF, label %ELSE + +IF: + %tex.IF = fmul float %tex.1, 3.0 + br label %END + +ELSE: + %tex.ELSE = fmul float %tex.1, 4.0 + br label %END + +END: + %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ] + ret float %tex.END +} + +; Another test that failed at some point because of terminator handling. +; +;CHECK-LABEL: {{^}}test_control_flow_4: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %IF +;CHECK: load +;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] +;CHECK: store +;CHECK: s_mov_b64 exec, [[SAVE]] +;CHECK: %END +;CHECK: image_sample +define <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 { +main_body: + %cond = icmp eq i32 %y, 0 + br i1 %cond, label %IF, label %END + +IF: + %data = load float, float addrspace(1)* %ptr + %gep = getelementptr float, float addrspace(1)* %ptr, i32 1 + store float %data, float addrspace(1)* %gep + br label %END + +END: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +; Kill is performed in WQM mode so that uniform kill behaves correctly ... 
+; +;CHECK-LABEL: {{^}}test_kill_0: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;SI: buffer_store_dword +;VI: flat_store_dword +;CHECK: s_wqm_b64 exec, exec +;CHECK: v_cmpx_ +;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] +;SI: buffer_store_dword +;VI: flat_store_dword +;CHECK: s_mov_b64 exec, [[SAVE]] +;CHECK: image_sample +define <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) #0 { +main_body: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + + %idx.0 = extractelement <2 x i32> %idx, i32 0 + %gep.0 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.0 + %data.0 = extractelement <2 x float> %data, i32 0 + store float %data.0, float addrspace(1)* %gep.0 + + call void @llvm.AMDGPU.kill(float %z) + + %idx.1 = extractelement <2 x i32> %idx, i32 1 + %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 + %data.1 = extractelement <2 x float> %data, i32 1 + store float %data.1, float addrspace(1)* %gep.1 + + %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %out = fadd <4 x float> %tex, %tex2 + + ret <4 x float> %out +} + +; ... but only if WQM is necessary. +; +;CHECK-LABEL: {{^}}test_kill_1: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;SI: buffer_store_dword +;VI: flat_store_dword +;CHECK-NOT: wqm +;CHECK: v_cmpx_ +define <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) #0 { +main_body: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx + store float %data, float addrspace(1)* %gep + + call void @llvm.AMDGPU.kill(float %z) + + ret <4 x float> %tex +} + +declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 + +declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 + +declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 + +declare void @llvm.AMDGPU.kill(float) +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind readnone }