Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -98,11 +98,13 @@
 
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
-  SmallVector<MachineInstr *, 1> ExecExports;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
+  void printInfo();
+
   void markInstruction(MachineInstr &MI, char Flag,
                        std::vector<WorkItem> &Worklist);
+  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -151,6 +153,39 @@
   return new SIWholeQuadMode;
 }
 
+static std::string stateString(int state) {
+  std::string Str;
+
+  if (state & StateWQM)
+    Str = "WQM";
+  if (state & StateExact) {
+    if (!Str.empty())
+      Str += '|';
+    Str += "Exact";
+  }
+
+  return Str;
+}
+
+void SIWholeQuadMode::printInfo() {
+  for (const auto &BII : Blocks) {
+    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n";
+    dbgs() << "  InNeeds = " << stateString(BII.second.InNeeds)
+           << ", Needs = " << stateString(BII.second.Needs)
+           << ", OutNeeds = " << stateString(BII.second.OutNeeds) << "\n\n";
+
+    for (const MachineInstr &MI : *BII.first) {
+      auto III = Instructions.find(&MI);
+      if (III == Instructions.end())
+        continue;
+
+      dbgs() << "  " << MI;
+      dbgs() << "    Needs = " << stateString(III->second.Needs)
+             << ", OutNeeds = " << stateString(III->second.OutNeeds) << "\n";
+    }
+  }
+}
+
 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
@@ -168,6 +203,45 @@
   Worklist.push_back(&MI);
 }
 
+/// Mark all instructions that define the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+                                  std::vector<WorkItem> &Worklist) {
+  for (const MachineOperand &Use : MI.uses()) {
+    if (!Use.isReg() || !Use.isUse())
+      continue;
+
+    unsigned Reg = Use.getReg();
+
+    // Handle physical registers that we need to track; this is mostly relevant
+    // for VCC, which can appear as the (implicit) input of a uniform branch,
+    // e.g. when a loop counter is stored in a VGPR.
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      if (Reg == AMDGPU::EXEC)
+        continue;
+
+      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
+        LiveRange &LR = LIS->getRegUnit(*RegUnit);
+        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+        if (!Value)
+          continue;
+
+        // Since we're in machine SSA, we do not need to track physical
+        // registers across basic blocks.
+        if (Value->isPHIDef())
+          continue;
+
+        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
+                        Worklist);
+      }
+
+      continue;
+    }
+
+    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
+      markInstruction(DefMI, StateWQM, Worklist);
+  }
+}
+
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
@@ -183,16 +257,19 @@
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
-      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+      if (TII->isDS(Opcode)) {
         Flags = StateWQM;
+      } else if (TII->isWQM(Opcode)) {
+        // Sampling instructions don't need to produce results for all pixels
+        // in a quad; they only require all inputs of a quad to have been
+        // computed for derivatives.
+        markUsesWQM(MI, Worklist);
+        GlobalFlags |= StateWQM;
+        continue;
       } else if (TII->isDisableWQM(MI)) {
         Flags = StateExact;
       } else {
-        // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP) {
-          if (MI.getOperand(4).getImm() != 0)
-            ExecExports.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
@@ -266,43 +343,9 @@
   // Propagate WQM flag to instruction inputs
   assert(II.Needs != (StateWQM | StateExact));
 
-  if (II.Needs != StateWQM)
-    return;
-
-  for (const MachineOperand &Use : MI.uses()) {
-    if (!Use.isReg() || !Use.isUse())
-      continue;
-
-    unsigned Reg = Use.getReg();
-
-    // Handle physical registers that we need to track; this is mostly relevant
-    // for VCC, which can appear as the (implicit) input of a uniform branch,
-    // e.g. when a loop counter is stored in a VGPR.
-    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
-      if (Reg == AMDGPU::EXEC)
-        continue;
-
-      for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
-        LiveRange &LR = LIS->getRegUnit(*RegUnit);
-        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
-        if (!Value)
-          continue;
-
-        // Since we're in machine SSA, we do not need to track physical
-        // registers across basic blocks.
-        if (Value->isPHIDef())
-          continue;
-
-        markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
-                        Worklist);
-      }
-
-      continue;
-    }
-
-    for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
-      markInstruction(DefMI, StateWQM, Worklist);
-  }
+  if (II.Needs == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
@@ -402,9 +445,12 @@
   if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
     return;
 
+  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
   unsigned SavedWQMReg = 0;
   bool WQMFromExec = isEntry;
   char State = isEntry ? StateExact : StateWQM;
+  MachineInstr *FirstNonWQM = nullptr;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   while (II != IE) {
@@ -429,21 +475,24 @@
       }
     }
 
+    DEBUG(dbgs() << "  " << MI);
+
     char Needs = 0;
    char OutNeeds = 0;
     auto InstrInfoIt = Instructions.find(&MI);
     if (InstrInfoIt != Instructions.end()) {
       Needs = InstrInfoIt->second.Needs;
       OutNeeds = InstrInfoIt->second.OutNeeds;
-
-      // Make sure to switch to Exact mode before the end of the block when
-      // Exact and only Exact is needed further downstream.
-      if (OutNeeds == StateExact && MI.isTerminator()) {
-        assert(Needs == 0);
-        Needs = StateExact;
-      }
     }
 
+    // Keep track of the first instruction of the current run of non-WQM
+    // instructions, so that we switch away from WQM as soon as possible,
+    // potentially saving a small amount of bandwidth on loads.
+    if (Needs == StateWQM)
+      FirstNonWQM = nullptr;
+    else if (!FirstNonWQM)
+      FirstNonWQM = &MI;
+
     // State switching
     if (Needs && State != Needs) {
       if (Needs == StateExact) {
@@ -452,7 +501,7 @@
         if (!WQMFromExec && (OutNeeds & StateWQM))
           SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
-        toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+        toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
       } else {
         assert(WQMFromExec == (SavedWQMReg == 0));
         toWQM(MBB, &MI, SavedWQMReg);
@@ -467,7 +516,9 @@
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
   } else if (BI.OutNeeds == StateExact && State != StateExact) {
-    toExact(MBB, MBB.end(), 0, LiveMaskReg);
+    toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+                             : MBB.getFirstTerminator(),
+            0, LiveMaskReg);
   }
 }
 
@@ -487,7 +538,6 @@
 
   Instructions.clear();
   Blocks.clear();
-  ExecExports.clear();
   LiveMaskQueries.clear();
 
  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -527,6 +577,8 @@
     }
   }
 
+  DEBUG(printInfo());
+
   lowerLiveMaskQueries(LiveMaskReg);
 
   // Handle the general case
Index: test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- test/CodeGen/AMDGPU/skip-if-dead.ll
+++ test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -345,10 +345,10 @@
 
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0,
 ; CHECK: [[BB4]]:
 ; CHECK: s_or_b64 exec, exec
+; CHECK: s_and_b64 exec, exec,
 ; CHECK: image_sample_c
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
-; CHECK: s_and_b64 exec, exec,
 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
 ; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -36,8 +36,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK-NOT: exec
 ;CHECK: .size test3
@@ -62,7 +62,8 @@
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
-;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
+;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
 main_body:
   %c.1 = mul i32 %c, %d
@@ -70,7 +71,9 @@
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
 
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
@@ -90,6 +93,7 @@
 ;CHECK: s_mov_b64 exec, [[SAVED]]
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0
@@ -97,7 +101,9 @@
 
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -117,6 +123,7 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 ;CHECK: %Flow
 ;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
@@ -136,7 +143,9 @@
 
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -202,35 +211,27 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;CHECK: store
-;CHECK: load
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK: v_cmp
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = extractelement <4 x float> %tex, i32 0
-
-  %idx.1 = extractelement <3 x i32> %idx, i32 0
-  %data.1 = extractelement <2 x float> %data, i32 0
-  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
-
-  %idx.2 = extractelement <3 x i32> %idx, i32 1
-  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %dtex.1 = extractelement <4 x float> %dtex, i32 0
 
-  %idx.3 = extractelement <3 x i32> %idx, i32 2
-  %data.3 = extractelement <2 x float> %data, i32 1
-  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
 
-  %cc = fcmp ogt float %z, 0.0
+  %cc = fcmp ogt float %dtex.1, 0.0
   br i1 %cc, label %IF, label %ELSE
 
 IF:
-  %tex.IF = fmul float %tex.1, 3.0
+  %tex.IF = fmul float %dtex.1, 3.0
   br label %END
 
 ELSE:
-  %tex.ELSE = fmul float %tex.1, 4.0
+  %tex.ELSE = fmul float %dtex.1, 4.0
   br label %END
 
 END:
@@ -245,12 +246,13 @@
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
-;CHECK: load
 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: load
 ;CHECK: store
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: %END
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
 main_body:
   %cond = icmp eq i32 %y, 0
@@ -263,7 +265,9 @@
 
 END:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
@@ -272,8 +276,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: buffer_store_dword
 ;CHECK: s_wqm_b64 exec, exec
 ;CHECK: v_cmpx_
@@ -296,7 +300,9 @@
   call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
 
   %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %out = fadd <4 x float> %tex, %tex2
+  %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %out = fadd <4 x float> %tex, %dtex
   ret <4 x float> %out
 }
 
@@ -309,18 +315,21 @@
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: image_sample
 ; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
 ; CHECK: v_cmpx_
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
 
   call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
 
   call void @llvm.AMDGPU.kill(float %z)
 
-  ret <4 x float> %tex
+  ret <4 x float> %dtex
 }
 
 ; Check prolog shaders.
@@ -390,8 +399,8 @@
 
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: buffer_load_dword
-; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
+; CHECK: image_sample
 ; CHECK: buffer_store_dwordx4
 define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
 entry:
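---

Note: the effect of requiring WQM for sampling *inputs* rather than for the
sampling instructions themselves is easiest to see on a dependent sampling
chain like the ones the updated tests use. A minimal sketch (LLVM IR, reusing
the llvm.SI.image.sample intrinsics from wqm.ll; the value names and the
s[0:1] exec copy in the comments are illustrative, not taken from any
particular test):

  ; %dtex's coordinates depend on %tex, so markUsesWQM marks the first
  ; image_sample and the bitcast as WQM: helper lanes must compute %tex so
  ; the hardware can take derivatives across the quad. The second
  ; image_sample no longer needs WQM itself, so the pass may switch to
  ; Exact before it:
  ;
  ;   image_sample ...              ; first sample, executes in WQM
  ;   s_and_b64 exec, exec, s[0:1]  ; restore live lanes (Exact)
  ;   image_sample ...              ; second sample, executes in Exact
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %dtex

The FirstNonWQM bookkeeping in processBlock is what lets the s_and_b64 move
up to the start of the run of non-WQM instructions instead of sitting
immediately before the first instruction that strictly requires Exact mode.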