diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -487,6 +487,8 @@
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   SmallVector<MachineInstr *, 12> SetInactiveInstrs;
   SmallVector<MachineInstr *, 12> SoftWQMInstrs;
+  bool HasImplicitDerivatives =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -506,6 +508,11 @@
         // If LOD is not supported WQM is not needed.
         if (!ST->hasExtendedImageInsts())
          continue;
+        // Only generate implicit WQM if implicit derivatives are required.
+        // This avoids inserting unintended WQM if a shader type without
+        // implicit derivatives uses an image sampling instruction.
+        if (!HasImplicitDerivatives)
+          continue;
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -391,27 +391,25 @@
 ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT: s_mov_b32 s19, 0xe00000
 ; GCN-NEXT: s_add_u32 s16, s16, s3
 ; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mov_b64 s[12:13], exec
-; GCN-NEXT: s_wqm_b64 exec, exec
 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x24
-; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
 ; GCN-NEXT: s_brev_b32 s0, 1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s12
 ; GCN-NEXT: s_mov_b32 s3, 0
 ; GCN-NEXT: s_mov_b32 s1, s0
 ; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NEXT: s_and_b64 exec, exec, s[12:13]
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
+; GCN-NEXT: s_nop 0
 ; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_add_f32_e32 v0, v2, v0
@@ -424,25 +422,22 @@
 ; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0
 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, exec_lo
-; GCN-SCRATCH-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GCN-SCRATCH-NEXT: s_clause 0x1
 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
 ; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
+; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
 ; GCN-SCRATCH-NEXT: ;;#ASMSTART
 ; GCN-SCRATCH-NEXT: ;;#ASMEND
+; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT: s_and_b32 exec_lo, exec_lo, s9
 ; GCN-SCRATCH-NEXT: s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT: s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -9,19 +9,22 @@
 ; GFX9-LABEL: non_preserved_vgpr_tuple8:
 ; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

 ; GFX9: v_mov_b32_e32 v36, v16
 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
 ; GFX9-NEXT: v_mov_b32_e32 v33, v13
 ; GFX9-NEXT: v_mov_b32_e32 v32, v12
+
+; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX9: ;;#ASMSTART
 ; GFX9-NEXT: ;;#ASMEND
 ; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -39,10 +42,6 @@
 ;
 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill

 ; GFX10: v_mov_b32_e32 v36, v16
 ; GFX10-NEXT: v_mov_b32_e32 v35, v15
@@ -50,10 +49,16 @@
 ; GFX10-NEXT: v_mov_b32_e32 v33, v13
 ; GFX10-NEXT: v_mov_b32_e32 v32, v12

+; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX10: ;;#ASMSTART
 ; GFX10-NEXT: ;;#ASMEND

 ; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_addk_i32 s32, 0x400
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
@@ -100,6 +105,7 @@
 ; GFX9-NEXT: v_mov_b32_e32 v41, v12

 ; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -133,12 +139,9 @@
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v41, v16
+; GFX10-NEXT: v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v42, v15
-; GFX10-NEXT: v_mov_b32_e32 v43, v14
-; GFX10-NEXT: v_mov_b32_e32 v44, v13
-; GFX10-NEXT: v_mov_b32_e32 v45, v12
+; GFX10-NEXT: v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -1,5 +1,48 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-wqm -o -  %s | FileCheck %s

+--- |
+  define amdgpu_ps void @test_strict_wwm_scc() {
+    ret void
+  }
+  define amdgpu_ps void @test_strict_wwm_scc2() {
+    ret void
+  }
+  define amdgpu_ps void @no_cfg() {
+    ret void
+  }
+  define amdgpu_ps void @copy_exec() {
+    ret void
+  }
+  define amdgpu_ps void @scc_always_live() {
+    ret void
+  }
+  define amdgpu_ps void @test_wwm_set_inactive_propagation() {
+    ret void
+  }
+  define amdgpu_ps void @test_wqm_lr_phi() {
+    ret void
+  }
+  define amdgpu_cs void @no_wqm_in_cs() {
+    ret void
+  }
+  define amdgpu_es void @no_wqm_in_es() {
+    ret void
+  }
+  define amdgpu_gs void @no_wqm_in_gs() {
+    ret void
+  }
+  define amdgpu_hs void @no_wqm_in_hs() {
+    ret void
+  }
+  define amdgpu_ls void @no_wqm_in_ls() {
+    ret void
+  }
+  define amdgpu_vs void @no_wqm_in_vs() {
+    ret void
+  }
+...
+---
+
 ---
 # Check for awareness that s_or_saveexec_b64 clobbers SCC
 #
@@ -298,3 +341,105 @@
   $vgpr1 = COPY %4.sub1:vreg_128
   SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
 ...
+
+---
+#CHECK-LABEL: name: no_wqm_in_cs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_cs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_es
+#CHECK-NOT: S_WQM
+name: no_wqm_in_es
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_gs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_gs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_hs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_hs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_ls
+#CHECK-NOT: S_WQM
+name: no_wqm_in_ls
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_vs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_vs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
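
Note (illustrative, not part of the patch): the case the new HasImplicitDerivatives check targets can be sketched in IR as below. The function and value names are made up; the intrinsic is the standard llvm.amdgcn.image.sample.2d overload. A compute shader takes no implicit derivatives, so with this change si-wqm no longer inserts an s_wqm exec transition around the sample; the same body under the amdgpu_ps calling convention still gets the implicit WQM.

define amdgpu_cs void @sample_in_cs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
  ; Image sample in a compute shader: no implicit derivatives are required,
  ; so SIWholeQuadMode now skips marking this instruction as needing WQM.
  %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  store <4 x float> %v, <4 x float> addrspace(1)* undef
  ret void
}

declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)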