Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -100,9 +100,11 @@
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
-  SmallVector<MachineInstr *, 1> ExecExports;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
 
+  void printInfo();
+
+  void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
   void propagateInstruction(const MachineInstr &MI, std::vector<WorkItem> &Worklist);
   void propagateBlock(const MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
@@ -149,6 +151,76 @@
   return new SIWholeQuadMode;
 }
 
+static std::string stateString(int state) {
+  std::string Str;
+
+  if (state & StateWQM)
+    Str = "WQM";
+  if (state & StateExact) {
+    if (!Str.empty())
+      Str += "|";
+    Str += "Exact";
+  }
+
+  return Str;
+}
+
+void SIWholeQuadMode::printInfo() {
+  for (const auto &BII : Blocks) {
+    dbgs() << "\nBB#" << BII.first->getNumber() << ":\n";
+    dbgs() << "  InNeeds = " << stateString(BII.second.InNeeds)
+           << ", Needs = " << stateString(BII.second.Needs)
+           << ", OutNeeds = " << stateString(BII.second.OutNeeds) << "\n\n";
+
+    for (const MachineInstr &MI : *BII.first) {
+      auto III = Instructions.find(&MI);
+      if (III == Instructions.end())
+        continue;
+
+      dbgs() << "  " << MI;
+      dbgs() << "    Needs = " << stateString(III->second.Needs)
+             << ", OutNeeds = " << stateString(III->second.OutNeeds) << "\n";
+    }
+  }
+}
+
+/// Mark all instructions defining the uses in \p MI as WQM.
+void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,
+                                  std::vector<WorkItem> &Worklist) {
+  for (const MachineOperand &Use : MI.uses()) {
+    if (!Use.isReg() || !Use.isUse())
+      continue;
+
+    // At this point, physical registers appear as (shader) inputs or
+    // non-monolithic shader outputs. Following those makes no sense (and
+    // would in fact be incorrect when the same VGPR is used as both an
+    // output and an input that leads to a NeedsWQM instruction).
+    //
+    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
+    // have to trace this, in practice it happens for 64-bit computations like
+    // pointers where both dwords are followed already anyway.
+    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+      continue;
+
+    for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
+      const MachineInstr *DefMI = Def.getParent();
+      InstrInfo &DefII = Instructions[DefMI];
+
+      // Obviously skip if DefMI is already flagged as NeedWQM.
+      //
+      // The instruction might also be flagged as NeedExact. This happens when
+      // the result of an atomic is used in a WQM computation. In this case,
+      // the atomic must not run for helper pixels and the WQM result is
+      // undefined.
+      if (DefII.Needs != 0)
+        continue;
+
+      DefII.Needs = StateWQM;
+      Worklist.push_back(DefMI);
+    }
+  }
+}
+
 // Scan instructions to determine which ones require an Exact execmask and
 // which ones seed WQM requirements.
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
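As background for the hunks below: a standalone C++ sketch of the worklist discipline that markUsesWQM relies on. The Inst struct, the Defs edges, and the instruction names are invented for the example; only the marking rule (flag each unflagged def of an input, queue it, repeat to a fixed point) mirrors the pass.

    #include <cstdio>
    #include <vector>

    enum { StateWQM = 1, StateExact = 2 };

    struct Inst {
      const char *Name;
      std::vector<Inst *> Defs; // instructions defining this one's inputs
      int Needs;
    };

    // Mirror of markUsesWQM: flag every unflagged def of MI's inputs as WQM
    // and queue it so its own inputs get visited in turn.
    static void markUsesWQM(Inst &MI, std::vector<Inst *> &Worklist) {
      for (Inst *Def : MI.Defs) {
        if (Def->Needs != 0) // already WQM, or Exact (e.g. an atomic): skip
          continue;
        Def->Needs = StateWQM;
        Worklist.push_back(Def);
      }
    }

    int main() {
      Inst Coord{"v_interp", {}, 0};
      Inst Mul{"v_mul", {&Coord}, 0};
      Inst Sample{"image_sample", {&Mul}, 0};

      std::vector<Inst *> Worklist;
      markUsesWQM(Sample, Worklist); // the sample seeds only its inputs
      while (!Worklist.empty()) {
        Inst *MI = Worklist.back();
        Worklist.pop_back();
        markUsesWQM(*MI, Worklist); // propagate transitively through defs
      }
      // Both inputs end up WQM; the sample itself stays unmarked, matching
      // the new scanInstructions behavior.
      std::printf("%s Needs=%d, %s Needs=%d, %s Needs=%d\n", Sample.Name,
                  Sample.Needs, Mul.Name, Mul.Needs, Coord.Name, Coord.Needs);
      return 0;
    }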
@@ -164,17 +236,19 @@
       unsigned Opcode = MI.getOpcode();
       char Flags = 0;
 
-      if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
+      if (TII->isDS(Opcode)) {
         Flags = StateWQM;
+      } else if (TII->isWQM(Opcode)) {
+        // Sampling instructions don't need to produce results for all pixels
+        // in a quad, they just require all inputs of a quad to have been
+        // computed for derivatives.
+        markUsesWQM(MI, Worklist);
+        GlobalFlags |= StateWQM;
       } else if (TII->get(Opcode).mayStore() &&
                  (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) {
         Flags = StateExact;
       } else {
-        // Handle export instructions with the exec mask valid flag set
-        if (Opcode == AMDGPU::EXP) {
-          if (MI.getOperand(4).getImm() != 0)
-            ExecExports.push_back(&MI);
-        } else if (Opcode == AMDGPU::SI_PS_LIVE) {
+        if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
@@ -249,41 +323,8 @@
   // Propagate WQM flag to instruction inputs
   assert(II.Needs != (StateWQM | StateExact));
 
-  if (II.Needs != StateWQM)
-    return;
-
-  for (const MachineOperand &Use : MI.uses()) {
-    if (!Use.isReg() || !Use.isUse())
-      continue;
-
-    // At this point, physical registers appear as inputs or outputs
-    // and following them makes no sense (and would in fact be incorrect
-    // when the same VGPR is used as both an output and an input that leads
-    // to a NeedsWQM instruction).
-    //
-    // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
-    // have to trace this, in practice it happens for 64-bit computations like
-    // pointers where both dwords are followed already anyway.
-    if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
-      continue;
-
-    for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) {
-      const MachineInstr *DefMI = Def.getParent();
-      InstrInfo &DefII = Instructions[DefMI];
-
-      // Obviously skip if DefMI is already flagged as NeedWQM.
-      //
-      // The instruction might also be flagged as NeedExact. This happens when
-      // the result of an atomic is used in a WQM computation. In this case,
-      // the atomic must not run for helper pixels and the WQM result is
-      // undefined.
-      if (DefII.Needs != 0)
-        continue;
-
-      DefII.Needs = StateWQM;
-      Worklist.push_back(DefMI);
-    }
-  }
+  if (II.Needs == StateWQM)
+    markUsesWQM(MI, Worklist);
 }
 
 void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB,
@@ -384,9 +425,12 @@
   if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
     return;
 
+  DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
+
   unsigned SavedWQMReg = 0;
   bool WQMFromExec = isEntry;
   char State = isEntry ? StateExact : StateWQM;
+  MachineInstr *FirstNonWQM = nullptr;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   while (II != IE) {
@@ -412,21 +456,24 @@
       }
     }
 
+    DEBUG(dbgs() << "  " << MI);
+
     char Needs = 0;
     char OutNeeds = 0;
     auto InstrInfoIt = Instructions.find(&MI);
     if (InstrInfoIt != Instructions.end()) {
       Needs = InstrInfoIt->second.Needs;
       OutNeeds = InstrInfoIt->second.OutNeeds;
-
-      // Make sure to switch to Exact mode before the end of the block when
-      // Exact and only Exact is needed further downstream.
-      if (OutNeeds == StateExact && (MI.isBranch() || MI.isTerminator())) {
-        assert(Needs == 0);
-        Needs = StateExact;
-      }
     }
 
+    // Keep track of the first consecutive non-WQM instruction, so that we
+    // switch away from WQM as soon as possible, potentially saving a small
+    // bit of bandwidth on loads.
+    if (Needs == StateWQM)
+      FirstNonWQM = nullptr;
+    else if (!FirstNonWQM)
+      FirstNonWQM = &MI;
+
     // State switching
     if (Needs && State != Needs) {
       if (Needs == StateExact) {
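The effect of the FirstNonWQM bookkeeping can be seen in miniature: with an invented instruction sequence, the switch to Exact is hoisted to the first of a run of non-WQM instructions rather than placed immediately before the instruction that demands Exact. A minimal sketch, not the pass's real data structures:

    #include <cstdio>

    enum { StateWQM = 1, StateExact = 2 };

    struct MI { const char *Name; int Needs; };

    int main() {
      // A WQM sample, two instructions with no requirement, an Exact store.
      MI Block[] = {{"image_sample", StateWQM},
                    {"v_add", 0},
                    {"v_mul", 0},
                    {"buffer_store", StateExact}};

      const MI *FirstNonWQM = nullptr;
      for (const MI &I : Block) {
        if (I.Needs == StateWQM)
          FirstNonWQM = nullptr; // still inside a run of WQM instructions
        else if (!FirstNonWQM)
          FirstNonWQM = &I;      // first instruction after the WQM run

        if (I.Needs == StateExact) // hoist the switch to the run's start
          std::printf("switch to Exact before %s\n", FirstNonWQM->Name);
      }
      return 0; // prints: switch to Exact before v_add
    }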
@@ -435,7 +482,7 @@
           if (!WQMFromExec && (OutNeeds & StateWQM))
             SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
 
-          toExact(MBB, &MI, SavedWQMReg, LiveMaskReg);
+          toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
         } else {
           assert(WQMFromExec == (SavedWQMReg == 0));
           toWQM(MBB, &MI, SavedWQMReg);
@@ -453,7 +500,9 @@
     assert(WQMFromExec == (SavedWQMReg == 0));
     toWQM(MBB, MBB.end(), SavedWQMReg);
   } else if (BI.OutNeeds == StateExact && State != StateExact) {
-    toExact(MBB, MBB.end(), 0, LiveMaskReg);
+    toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+                             : MBB.getFirstTerminator(),
+            0, LiveMaskReg);
   }
 }
 
@@ -473,7 +522,6 @@
 
   Instructions.clear();
   Blocks.clear();
-  ExecExports.clear();
   LiveMaskQueries.clear();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -509,6 +557,8 @@
     return true;
   }
 
+  DEBUG(printInfo());
+
   lowerLiveMaskQueries(LiveMaskReg);
 
   EntryMI = nullptr;
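For reference when reading the reordered CHECK lines in the tests below: s_wqm_b64 derives whole-quad mode from the exec mask by enabling every lane of any quad that has at least one live lane, so helper lanes run and derivatives have valid inputs. A scalar model of that computation, written from a straightforward reading of the ISA semantics rather than taken from this patch:

    #include <cstdint>
    #include <cstdio>

    // For each quad of four consecutive lanes: any live lane turns the
    // whole quad live.
    static uint64_t wqm64(uint64_t exec) {
      uint64_t result = 0;
      for (int quad = 0; quad < 16; ++quad) {
        uint64_t mask = 0xFull << (4 * quad);
        if (exec & mask)
          result |= mask;
      }
      return result;
    }

    int main() {
      uint64_t exec = 0x21; // lanes 0 and 5 live
      std::printf("wqm = 0x%016llx\n", (unsigned long long)wqm64(exec));
      // prints wqm = 0x00000000000000ff: quads 0 and 1 fully enabled
      return 0;
    }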
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -36,8 +36,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK-NOT: exec
 ;CHECK: .size test3
@@ -62,14 +62,17 @@
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
 ;CHECK: store
 ;CHECK: s_wqm_b64 exec, exec
-;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf
+;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
 main_body:
   %c.1 = mul i32 %c, %d
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1
   store float %data, float addrspace(1)* %gep
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
@@ -89,6 +92,7 @@
 ;CHECK: s_mov_b64 exec, [[SAVED]]
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0
@@ -96,7 +100,9 @@
 
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -117,6 +123,7 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
 ;CHECK: image_sample
+;CHECK: image_sample
 ;CHECK: %Flow
 ;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
 ;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
@@ -136,7 +143,9 @@
 IF:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %data.if = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
 ELSE:
@@ -206,38 +215,29 @@
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
-;CHECK: store
-;CHECK: load
+;CHECK: image_sample
 ;CHECK: store
 ;CHECK: v_cmp
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, i32 %coord) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = extractelement <4 x float> %tex, i32 0
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %dtex.1 = extractelement <4 x float> %dtex, i32 0
 
   %idx.1 = extractelement <3 x i32> %idx, i32 0
   %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1
-  %data.1 = extractelement <2 x float> %data, i32 0
-  store float %data.1, float addrspace(1)* %gep.1
-
-  %idx.2 = extractelement <3 x i32> %idx, i32 1
-  %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2
-  %z = load float, float addrspace(1)* %gep.2
+  store float %dtex.1, float addrspace(1)* %gep.1
 
-  %idx.3 = extractelement <3 x i32> %idx, i32 2
-  %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3
-  %data.3 = extractelement <2 x float> %data, i32 1
-  store float %data.3, float addrspace(1)* %gep.3
-
-  %cc = fcmp ogt float %z, 0.0
+  %cc = fcmp ogt float %dtex.1, 0.0
   br i1 %cc, label %IF, label %ELSE
 
 IF:
-  %tex.IF = fmul float %tex.1, 3.0
+  %tex.IF = fmul float %dtex.1, 3.0
   br label %END
 
 ELSE:
-  %tex.ELSE = fmul float %tex.1, 4.0
+  %tex.ELSE = fmul float %dtex.1, 4.0
   br label %END
 
 END:
@@ -252,12 +252,13 @@
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK: %IF
-;CHECK: load
 ;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
+;CHECK: load
 ;CHECK: store
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: %END
 ;CHECK: image_sample
+;CHECK: image_sample
 define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) {
 main_body:
   %cond = icmp eq i32 %y, 0
@@ -271,7 +272,9 @@
 
 END:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %tex
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %dtex
 }
 
 ; Kill is performed in WQM mode so that uniform kill behaves correctly ...
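The second image_sample added throughout these tests consumes the first sample's result as its coordinate, so that result must exist for helper lanes too: the hardware approximates derivatives from neighboring lanes of a quad. A toy single-quad model of that dependency, with all values invented:

    #include <cstdio>

    int main() {
      // Texture coordinate of each lane in one quad:
      //   lane0 lane1
      //   lane2 lane3
      float u[4] = {0.10f, 0.35f, 0.12f, 0.37f};
      bool lane1Live = false; // lane 1 only exists to help its neighbors

      // Derivatives come from lane differences, so the helper lane's
      // coordinate must have been computed in WQM even though its own
      // result is discarded:
      float ddx = u[1] - u[0];
      float ddy = u[2] - u[0];
      std::printf("ddx=%.2f ddy=%.2f (lane1 live=%d)\n", ddx, ddy, lane1Live);
      return 0;
    }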
@@ -280,8 +283,8 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
-;CHECK: image_sample
 ;CHECK: s_and_b64 exec, exec, [[ORIG]]
+;CHECK: image_sample
 ;SI: buffer_store_dword
 ;VI: flat_store_dword
 ;CHECK: s_wqm_b64 exec, exec
@@ -308,7 +311,9 @@
   store float %data.1, float addrspace(1)* %gep.1
 
   %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %out = fadd <4 x float> %tex, %tex2
+  %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %out = fadd <4 x float> %tex, %dtex
   ret <4 x float> %out
 }
 
@@ -321,6 +326,7 @@
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: image_sample
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
+; CHECK: image_sample
 ; SI: buffer_store_dword
 ; VI: flat_store_dword
 ; CHECK-NOT: wqm
@@ -328,13 +334,15 @@
 define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
 main_body:
   %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
+  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %idx
   store float %data, float addrspace(1)* %gep
 
   call void @llvm.AMDGPU.kill(float %z)
 
-  ret <4 x float> %tex
+  ret <4 x float> %dtex
 }
 
 ; Check prolog shaders.