Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -249,6 +249,7 @@
       case ISD::STORE:
       case ISD::BUILD_VECTOR:
       case ISD::BITCAST:
+      case ISD::UNDEF:
       case ISD::EXTRACT_VECTOR_ELT:
       case ISD::INSERT_VECTOR_ELT:
       case ISD::EXTRACT_SUBVECTOR:
@@ -516,6 +517,7 @@
         case ISD::STORE:
         case ISD::BUILD_VECTOR:
         case ISD::BITCAST:
+        case ISD::UNDEF:
         case ISD::EXTRACT_VECTOR_ELT:
         case ISD::INSERT_VECTOR_ELT:
         case ISD::INSERT_SUBVECTOR:
Index: llvm/test/CodeGen/AMDGPU/commute-shifts.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -5,14 +5,6 @@
 define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 ; SI-LABEL: main:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_mov_b32 s0, 0
-; SI-NEXT:    s_mov_b32 s1, s0
-; SI-NEXT:    s_mov_b32 s2, s0
-; SI-NEXT:    s_mov_b32 s3, s0
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s0
-; SI-NEXT:    s_mov_b32 s6, s0
-; SI-NEXT:    s_mov_b32 s7, s0
 ; SI-NEXT:    image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 7, v0
@@ -26,14 +18,6 @@
 ;
 ; VI-LABEL: main:
 ; VI:       ; %bb.0: ; %bb
-; VI-NEXT:    s_mov_b32 s0, 0
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_mov_b32 s2, s0
-; VI-NEXT:    s_mov_b32 s3, s0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s0
-; VI-NEXT:    s_mov_b32 s6, s0
-; VI-NEXT:    s_mov_b32 s7, s0
 ; VI-NEXT:    image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; VI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 7, v0
Index: llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -213,7 +213,7 @@
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ]
+  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
   store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
   ret void
 }
@@ -266,7 +266,7 @@
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ]
+  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ]
   store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
   ret void
 }
Index: llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -4,16 +4,8 @@
 define amdgpu_ps float @_amdgpu_ps_main() #0 {
 ; GCN-LABEL: _amdgpu_ps_main:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    s_mov_b32 s0, 0
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_mov_b32 s2, s0
-; GCN-NEXT:    s_mov_b32 s3, s0
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s0
-; GCN-NEXT:    s_mov_b32 s6, s0
-; GCN-NEXT:    s_mov_b32 s7, s0
 ; GCN-NEXT:    image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_clause 0x1
 ; GCN-NEXT:    image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
Index: llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -97,14 +97,7 @@
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX9-NEXT:    s_branch .LBB0_4
 ; GFX9-NEXT:  .LBB0_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:  .LBB0_3: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -237,14 +230,7 @@
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX9-NEXT:    s_branch .LBB1_4
 ; GFX9-NEXT:  .LBB1_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:  .LBB1_3: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -377,14 +363,7 @@
 ; GFX9-NEXT:    s_cbranch_execz .LBB2_3
 ; GFX9-NEXT:    s_branch .LBB2_4
 ; GFX9-NEXT:  .LBB2_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    v_mov_b32_e32 v3, s9
-; GFX9-NEXT:    v_mov_b32_e32 v4, s10
-; GFX9-NEXT:    v_mov_b32_e32 v5, s11
+; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
 ; GFX9-NEXT:  .LBB2_3: ; %T
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -555,22 +534,7 @@
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_3
 ; GFX9-NEXT:    s_branch .LBB3_4
 ; GFX9-NEXT:  .LBB3_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s8
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s8
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:  .LBB3_3: ; %T
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -743,22 +707,7 @@
 ; GFX9-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX9-NEXT:    s_branch .LBB4_4
 ; GFX9-NEXT:  .LBB4_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s8
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s8
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:  .LBB4_3: ; %T
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
@@ -931,22 +880,7 @@
 ; GFX9-NEXT:    s_cbranch_execz .LBB5_3
 ; GFX9-NEXT:    s_branch .LBB5_4
 ; GFX9-NEXT:  .LBB5_2:
-; GFX9-NEXT:    s_mov_b32 s8, 0
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_mov_b32 s10, s8
-; GFX9-NEXT:    s_mov_b32 s11, s8
-; GFX9-NEXT:    s_mov_b32 s12, s8
-; GFX9-NEXT:    s_mov_b32 s13, s8
-; GFX9-NEXT:    s_mov_b32 s14, s8
-; GFX9-NEXT:    s_mov_b32 s15, s8
-; GFX9-NEXT:    v_mov_b32_e32 v4, s8
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
-; GFX9-NEXT:    v_mov_b32_e32 v6, s10
-; GFX9-NEXT:    v_mov_b32_e32 v7, s11
-; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    v_mov_b32_e32 v9, s13
-; GFX9-NEXT:    v_mov_b32_e32 v10, s14
-; GFX9-NEXT:    v_mov_b32_e32 v11, s15
+; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX9-NEXT:  .LBB5_3: ; %T
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc
Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -382,18 +382,10 @@
 ; GCN-LABEL: insertelement_to_sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s12, 0
-; GCN-NEXT:    s_mov_b32 s4, s12
-; GCN-NEXT:    s_mov_b32 s5, s12
-; GCN-NEXT:    s_mov_b32 s6, s12
-; GCN-NEXT:    s_mov_b32 s7, s12
-; GCN-NEXT:    s_mov_b32 s8, s12
-; GCN-NEXT:    s_mov_b32 s9, s12
-; GCN-NEXT:    s_mov_b32 s10, s12
-; GCN-NEXT:    s_mov_b32 s11, s12
-; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
+; GCN-NEXT:    s_mov_b32 s4, 0
+; GCN-NEXT:    image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
Index: llvm/test/CodeGen/AMDGPU/select-undef.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/select-undef.ll
+++ llvm/test/CodeGen/AMDGPU/select-undef.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}select_undef_lhs:
 ; GCN: s_waitcnt
@@ -43,3 +43,220 @@
 }
 
 declare float @llvm.amdgcn.rcp.f32(float)
+
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v6f32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) { ; %ptr is unused; memory ops below use undef pointers
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef ; volatile load through an undef addrspace(3) pointer
+  %add = fadd <6 x float> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <6 x float> %add, <6 x float> addrspace(3)* undef ; uses %add so the loop result is not dead
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v6i32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) { ; %ptr is unused; memory ops below use undef pointers
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef ; volatile load through an undef addrspace(3) pointer
+  %add = add <6 x i32> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef ; uses %add so the loop result is not dead
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v5f32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) { ; %ptr is unused; memory ops below use undef pointers
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef ; volatile load through an undef addrspace(3) pointer
+  %add = fadd <5 x float> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <5 x float> %add, <5 x float> addrspace(3)* undef ; uses %add so the loop result is not dead
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v5i32:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) { ; %ptr is unused; memory ops below use undef pointers
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef ; volatile load through an undef addrspace(3) pointer
+  %add = add <5 x i32> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef ; uses %add so the loop result is not dead
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v3f64:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) { ; 64-bit element variant; loads and stores go through %ptr
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr ; volatile load from the kernel's pointer argument
+  %add = fadd <3 x double> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr ; uses %add so the loop result is not dead
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v3i64:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) { ; 64-bit element variant; loads and stores go through %ptr
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr ; volatile load from the kernel's pointer argument
+  %add = add <3 x i64> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr ; uses %add so the loop result is not dead
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v4f16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) { ; 16-bit element variant; loads and stores go through %ptr
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr ; volatile load from the kernel's pointer argument
+  %add = fadd <4 x half> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr ; uses %add so the loop result is not dead
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v4i16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) { ; 16-bit element variant; loads and stores go through %ptr
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr ; volatile load from the kernel's pointer argument
+  %add = add <4 x i16> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr ; uses %add so the loop result is not dead
+  ret void
+}
+
+; Make sure the vector undef isn't lowered into 0s.
+; GCN-LABEL: {{^}}undef_v2f16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) { ; two 16-bit elements; loads and stores go through %ptr
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr ; volatile load from the kernel's pointer argument
+  %add = fadd <2 x half> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr ; uses %add so the loop result is not dead
+  ret void
+}
+
+; GCN-LABEL: {{^}}undef_v2i16:
+; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
+; GCN: s_cbranch_vccnz
+define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) { ; two 16-bit elements; loads and stores go through %ptr
+entry:
+  br label %loop
+
+loop:
+  %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ] ; undef when entered from %entry, previous sum on the back edge
+  %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr ; volatile load from the kernel's pointer argument
+  %add = add <2 x i16> %load, %phi ; loop-carried value fed back into %phi
+  br i1 %cond, label %loop, label %ret ; %cond is a kernel argument, not changed inside the loop
+
+ret:
+  store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr ; uses %add so the loop result is not dead
+  ret void
+}
+
+; We were expanding undef vectors into zero vectors. Optimizations
+; would then see that no elements of the vector were used and re-form
+; the undef vector, resulting in a combiner loop.
+; GCN-LABEL: {{^}}inf_loop_undef_vector:
+; GCN: s_waitcnt
+; GCN-NEXT: v_mad_u64_u32
+; GCN-NEXT: v_mul_lo_u32
+; GCN-NEXT: v_mul_lo_u32
+; GCN-NEXT: v_add3_u32
+; GCN-NEXT: global_store_dwordx2
+define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) { ; reduced input for the combiner loop described above
+  %i = insertelement <6 x float> %arg, float %arg1, i64 2 ; replace float lane 2 with %arg1
+  %i3 = bitcast <6 x float> %i to <3 x i64> ; reinterpret the same 192 bits as three i64 elements
+  %i4 = extractelement <3 x i64> %i3, i64 0 ; only i64 elements 0 and 1 are read; element 2 is never extracted
+  %i5 = extractelement <3 x i64> %i3, i64 1
+  %i6 = mul i64 %i5, %arg2
+  %i7 = add i64 %i6, %i4 ; %i7 = %i5 * %arg2 + %i4
+  store volatile i64 %i7, i64 addrspace(1)* undef, align 4 ; volatile store through an undef addrspace(1) pointer uses the result
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1406,28 +1406,20 @@
 define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 {
 ; SI-LABEL: if_after_kill_block:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_mov_b64 s[2:3], exec
+; SI-NEXT:    s_mov_b64 s[0:1], exec
 ; SI-NEXT:    s_wqm_b64 exec, exec
-; SI-NEXT:    s_mov_b32 s0, 0
 ; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v1
-; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; SI-NEXT:    s_cbranch_execz .LBB13_3
 ; SI-NEXT:  ; %bb.1: ; %bb3
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
+; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
 ; SI-NEXT:    s_cbranch_scc0 .LBB13_6
 ; SI-NEXT:  ; %bb.2: ; %bb3
 ; SI-NEXT:    s_andn2_b64 exec, exec, vcc
 ; SI-NEXT:  .LBB13_3: ; %bb4
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_mov_b32 s1, s0
-; SI-NEXT:    s_mov_b32 s2, s0
-; SI-NEXT:    s_mov_b32 s3, s0
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s0
-; SI-NEXT:    s_mov_b32 s6, s0
-; SI-NEXT:    s_mov_b32 s7, s0
+; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1448,28 +1440,20 @@
 ;
 ; GFX10-WAVE64-LABEL: if_after_kill_block:
 ; GFX10-WAVE64:       ; %bb.0: ; %bb
-; GFX10-WAVE64-NEXT:    s_mov_b64 s[2:3], exec
+; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX10-WAVE64-NEXT:    s_wqm_b64 exec, exec
 ; GFX10-WAVE64-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v1
-; GFX10-WAVE64-NEXT:    s_mov_b32 s0, 0
-; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX10-WAVE64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10-WAVE64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX10-WAVE64-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX10-WAVE64-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
+; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], vcc
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 .LBB13_6
 ; GFX10-WAVE64-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
 ; GFX10-WAVE64-NEXT:  .LBB13_3: ; %bb4
-; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX10-WAVE64-NEXT:    s_mov_b32 s1, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s2, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s3, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s4, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s5, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s6, s0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s7, s0
+; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-WAVE64-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
 ; GFX10-WAVE64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE64-NEXT:    v_cmp_neq_f32_e32 vcc, 0, v0
@@ -1488,28 +1472,20 @@
 ;
 ; GFX10-WAVE32-LABEL: if_after_kill_block:
 ; GFX10-WAVE32:       ; %bb.0: ; %bb
-; GFX10-WAVE32-NEXT:    s_mov_b32 s1, exec_lo
+; GFX10-WAVE32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-WAVE32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-WAVE32-NEXT:    v_cmp_nle_f32_e32 vcc_lo, 0, v1
-; GFX10-WAVE32-NEXT:    s_mov_b32 s0, 0
-; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
-; GFX10-WAVE32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-WAVE32-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-WAVE32-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
-; GFX10-WAVE32-NEXT:    s_andn2_b32 s1, s1, vcc_lo
+; GFX10-WAVE32-NEXT:    s_andn2_b32 s0, s0, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 .LBB13_6
 ; GFX10-WAVE32-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
 ; GFX10-WAVE32-NEXT:  .LBB13_3: ; %bb4
-; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-WAVE32-NEXT:    s_mov_b32 s1, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s2, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s3, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s4, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s5, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s6, s0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s7, s0
+; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-WAVE32-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
 ; GFX10-WAVE32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE32-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 0, v0
@@ -1528,29 +1504,22 @@
 ;
 ; GFX11-LABEL: if_after_kill_block:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX11-NEXT:    s_wqm_b64 exec, exec
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_mov_b64 s[4:5], exec
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11-NEXT:    v_cmpx_nle_f32_e32 0, v1
-; GFX11-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX11-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
 ; GFX11-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX11-NEXT:  ; %bb.1: ; %bb3
 ; GFX11-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
-; GFX11-NEXT:    s_and_not1_b64 s[2:3], s[2:3], vcc
+; GFX11-NEXT:    s_and_not1_b64 s[0:1], s[0:1], vcc
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB13_6
 ; GFX11-NEXT:  ; %bb.2: ; %bb3
 ; GFX11-NEXT:    s_and_not1_b64 exec, exec, vcc
 ; GFX11-NEXT:  .LBB13_3: ; %bb4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s0
-; GFX11-NEXT:    s_mov_b32 s6, s0
-; GFX11-NEXT:    s_mov_b32 s7, s0
+; GFX11-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX11-NEXT:    image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D
 ; GFX11-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1593,19 +1562,11 @@
 define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; SI-LABEL: cbranch_kill:
 ; SI:       ; %bb.0: ; %.entry
-; SI-NEXT:    s_mov_b32 s4, 0
 ; SI-NEXT:    s_mov_b64 s[0:1], exec
 ; SI-NEXT:    v_mov_b32_e32 v4, 0
 ; SI-NEXT:    v_mov_b32_e32 v2, v1
 ; SI-NEXT:    v_mov_b32_e32 v3, v1
-; SI-NEXT:    s_mov_b32 s5, s4
-; SI-NEXT:    s_mov_b32 s6, s4
-; SI-NEXT:    s_mov_b32 s7, s4
-; SI-NEXT:    s_mov_b32 s8, s4
-; SI-NEXT:    s_mov_b32 s9, s4
-; SI-NEXT:    s_mov_b32 s10, s4
-; SI-NEXT:    s_mov_b32 s11, s4
-; SI-NEXT:    image_sample_l v1, v[1:4], s[4:11], s[0:3] dmask:0x1 da
+; SI-NEXT:    image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
@@ -1636,16 +1597,8 @@
 ; GFX10-WAVE64-LABEL: cbranch_kill:
 ; GFX10-WAVE64:       ; %bb.0: ; %.entry
 ; GFX10-WAVE64-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-WAVE64-NEXT:    s_mov_b32 s4, 0
 ; GFX10-WAVE64-NEXT:    s_mov_b64 s[0:1], exec
-; GFX10-WAVE64-NEXT:    s_mov_b32 s5, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s6, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s7, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s8, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s9, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s10, s4
-; GFX10-WAVE64-NEXT:    s_mov_b32 s11, s4
-; GFX10-WAVE64-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-WAVE64-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-WAVE64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE64-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
@@ -1676,16 +1629,8 @@
 ; GFX10-WAVE32-LABEL: cbranch_kill:
 ; GFX10-WAVE32:       ; %bb.0: ; %.entry
 ; GFX10-WAVE32-NEXT:    v_mov_b32_e32 v2, 0
-; GFX10-WAVE32-NEXT:    s_mov_b32 s4, 0
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s0, exec_lo
-; GFX10-WAVE32-NEXT:    s_mov_b32 s5, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s6, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s7, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s8, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s9, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s10, s4
-; GFX10-WAVE32-NEXT:    s_mov_b32 s11, s4
-; GFX10-WAVE32-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX10-WAVE32-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX10-WAVE32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-WAVE32-NEXT:    v_cmp_ge_f32_e32 vcc_lo, 0, v1
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
@@ -1716,16 +1661,8 @@
 ; GFX11-LABEL: cbranch_kill:
 ; GFX11:       ; %bb.0: ; %.entry
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    s_mov_b32 s4, 0
 ; GFX11-NEXT:    s_mov_b64 s[0:1], exec
-; GFX11-NEXT:    s_mov_b32 s5, s4
-; GFX11-NEXT:    s_mov_b32 s6, s4
-; GFX11-NEXT:    s_mov_b32 s7, s4
-; GFX11-NEXT:    s_mov_b32 s8, s4
-; GFX11-NEXT:    s_mov_b32 s9, s4
-; GFX11-NEXT:    s_mov_b32 s10, s4
-; GFX11-NEXT:    s_mov_b32 s11, s4
-; GFX11-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX11-NEXT:    image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
 ; GFX11-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmpx_ge_f32_e32 0, v1
Index: llvm/test/CodeGen/AMDGPU/v1024.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/v1024.ll
+++ llvm/test/CodeGen/AMDGPU/v1024.ll
@@ -10,6 +10,7 @@
 entry:
   %alloca = alloca <32 x i32>, align 16, addrspace(5)
   %cast = bitcast <32 x i32> addrspace(5)* %alloca to i8 addrspace(5)*
+  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %cast, i8 0, i32 128, i1 false)
   br i1 undef, label %if.then.i.i, label %if.else.i
 
 if.then.i.i:                                      ; preds = %entry
@@ -24,6 +25,7 @@
   ret void
 }
 
+declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) ; memset stores through its destination, so it is writeonly
 declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
 
 declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
Index: llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -14,7 +14,6 @@
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    s_mov_b32 s4, 0
 ; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    v_mov_b32_e32 v36, v16
@@ -22,13 +21,6 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v34, v14
 ; GFX9-NEXT:    v_mov_b32_e32 v33, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v32, v12
-; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_mov_b32 s6, s4
-; GFX9-NEXT:    s_mov_b32 s7, s4
-; GFX9-NEXT:    s_mov_b32 s8, s4
-; GFX9-NEXT:    s_mov_b32 s9, s4
-; GFX9-NEXT:    s_mov_b32 s10, s4
-; GFX9-NEXT:    s_mov_b32 s11, s4
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -82,16 +74,8 @@
 ; GFX10-NEXT:    v_mov_b32_e32 v34, v14
 ; GFX10-NEXT:    v_mov_b32_e32 v33, v13
 ; GFX10-NEXT:    v_mov_b32_e32 v32, v12
-; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
-; GFX10-NEXT:    s_mov_b32 s5, s4
-; GFX10-NEXT:    s_mov_b32 s6, s4
-; GFX10-NEXT:    s_mov_b32 s7, s4
-; GFX10-NEXT:    s_mov_b32 s8, s4
-; GFX10-NEXT:    s_mov_b32 s9, s4
-; GFX10-NEXT:    s_mov_b32 s10, s4
-; GFX10-NEXT:    s_mov_b32 s11, s4
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -145,16 +129,8 @@
 ; GFX11-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
 ; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
 ; GFX11-NEXT:    v_mov_b32_e32 v32, v12
-; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX11-NEXT:    s_mov_b32 s33, s32
-; GFX11-NEXT:    s_mov_b32 s1, s0
-; GFX11-NEXT:    s_mov_b32 s2, s0
-; GFX11-NEXT:    s_mov_b32 s3, s0
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s0
-; GFX11-NEXT:    s_mov_b32 s6, s0
-; GFX11-NEXT:    s_mov_b32 s7, s0
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
 ; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
@@ -225,65 +201,41 @@
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX9-NEXT:    v_writelane_b32 v40, s33, 10
-; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
-; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
-; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
-; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
-; GFX9-NEXT:    v_writelane_b32 v40, s40, 6
-; GFX9-NEXT:    v_writelane_b32 v40, s41, 7
+; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT:    s_mov_b32 s33, s32
-; GFX9-NEXT:    v_writelane_b32 v40, s42, 8
-; GFX9-NEXT:    s_mov_b32 s36, 0
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    v_writelane_b32 v40, s43, 9
 ; GFX9-NEXT:    v_mov_b32_e32 v45, v16
 ; GFX9-NEXT:    v_mov_b32_e32 v44, v15
 ; GFX9-NEXT:    v_mov_b32_e32 v43, v14
 ; GFX9-NEXT:    v_mov_b32_e32 v42, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v12
-; GFX9-NEXT:    s_mov_b32 s37, s36
-; GFX9-NEXT:    s_mov_b32 s38, s36
-; GFX9-NEXT:    s_mov_b32 s39, s36
-; GFX9-NEXT:    s_mov_b32 s40, s36
-; GFX9-NEXT:    s_mov_b32 s41, s36
-; GFX9-NEXT:    s_mov_b32 s42, s36
-; GFX9-NEXT:    s_mov_b32 s43, s36
-; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_addk_i32 s32, 0x800
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1
 ; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT:    v_readlane_b32 s43, v40, 9
-; GFX9-NEXT:    v_readlane_b32 s42, v40, 8
-; GFX9-NEXT:    v_readlane_b32 s41, v40, 7
-; GFX9-NEXT:    v_readlane_b32 s40, v40, 6
-; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
-; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
-; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
-; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
 ; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX9-NEXT:    s_addk_i32 s32, 0xf800
-; GFX9-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
@@ -298,66 +250,42 @@
 ; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_mov_b32 exec_lo, s4
-; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT:    s_mov_b32 s33, s32
 ; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_addk_i32 s32, 0x400
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
 ; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v41, v16
 ; GFX10-NEXT:    v_mov_b32_e32 v42, v15
 ; GFX10-NEXT:    v_mov_b32_e32 v43, v14
-; GFX10-NEXT:    v_mov_b32_e32 v44, v13
 ; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    v_mov_b32_e32 v44, v13
 ; GFX10-NEXT:    v_mov_b32_e32 v45, v12
-; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
-; GFX10-NEXT:    s_mov_b32 s36, 0
-; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
-; GFX10-NEXT:    s_mov_b32 s37, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
-; GFX10-NEXT:    s_mov_b32 s38, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
-; GFX10-NEXT:    s_mov_b32 s39, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s40, 6
-; GFX10-NEXT:    s_mov_b32 s40, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s41, 7
-; GFX10-NEXT:    s_mov_b32 s41, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s42, 8
-; GFX10-NEXT:    s_mov_b32 s42, s36
-; GFX10-NEXT:    v_writelane_b32 v40, s43, 9
-; GFX10-NEXT:    s_mov_b32 s43, s36
-; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
-; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
-; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX10-NEXT:    s_clause 0x4
 ; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33
 ; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4
 ; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8
 ; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12
 ; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16
-; GFX10-NEXT:    v_readlane_b32 s43, v40, 9
-; GFX10-NEXT:    v_readlane_b32 s42, v40, 8
-; GFX10-NEXT:    v_readlane_b32 s41, v40, 7
-; GFX10-NEXT:    v_readlane_b32 s40, v40, 6
-; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
-; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
-; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
-; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
 ; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
-; GFX10-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
@@ -372,7 +300,7 @@
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
-; GFX11-NEXT:    v_writelane_b32 v40, s33, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:16
@@ -380,56 +308,32 @@
 ; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v45, s33
+; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_add_i32 s32, s32, 32
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
 ; GFX11-NEXT:    v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v45, v12
-; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
-; GFX11-NEXT:    s_mov_b32 s36, 0
-; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
-; GFX11-NEXT:    s_mov_b32 s37, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
-; GFX11-NEXT:    s_mov_b32 s38, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
-; GFX11-NEXT:    s_mov_b32 s39, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s40, 6
-; GFX11-NEXT:    s_mov_b32 s40, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s41, 7
-; GFX11-NEXT:    s_mov_b32 s41, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s42, 8
-; GFX11-NEXT:    s_mov_b32 s42, s36
-; GFX11-NEXT:    v_writelane_b32 v40, s43, 9
-; GFX11-NEXT:    s_mov_b32 s43, s36
-; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GFX11-NEXT:    s_clause 0x4
 ; GFX11-NEXT:    scratch_load_b32 v45, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:8
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:12
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:16
-; GFX11-NEXT:    v_readlane_b32 s43, v40, 9
-; GFX11-NEXT:    v_readlane_b32 s42, v40, 8
-; GFX11-NEXT:    v_readlane_b32 s41, v40, 7
-; GFX11-NEXT:    v_readlane_b32 s40, v40, 6
-; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
-; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
-; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
-; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
 ; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
-; GFX11-NEXT:    v_readlane_b32 s33, v40, 10
+; GFX11-NEXT:    v_readlane_b32 s33, v40, 2
 ; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
Index: llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
+++ llvm/test/CodeGen/AMDGPU/wmma_modifiers.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_cs void @xyz () {
 ; CHECK-LABEL: xyz:
-; CHECK: v_wmma_f32_16x16x16_f16 v[0:3], v[4:11], v[4:11], v[0:3]
+; CHECK: v_wmma_f32_16x16x16_f16 v[0:3], v[0:7], v[0:7], v[0:3]
 
 .entry:
   br label %loop
Index: llvm/test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/wqm.ll
+++ llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -1833,87 +1833,54 @@
 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 ; GFX9-W64-LABEL: test_loop_vcc:
 ; GFX9-W64:       ; %bb.0: ; %entry
-; GFX9-W64-NEXT:    s_mov_b64 s[8:9], exec
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v7, v3
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v6, v2
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
-; GFX9-W64-NEXT:    s_mov_b32 s0, 0
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-W64-NEXT:    image_store v[4:7], v0, s[0:7] dmask:0xf unorm
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-W64-NEXT:    s_mov_b32 s10, 0x40e00000
+; GFX9-W64-NEXT:    s_mov_b32 s4, 0x40e00000
 ; GFX9-W64-NEXT:    s_branch .LBB31_2
 ; GFX9-W64-NEXT:  .LBB31_1: ; %body
 ; GFX9-W64-NEXT:    ; in Loop: Header=BB31_2 Depth=1
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
 ; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
-; GFX9-W64-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB31_4
 ; GFX9-W64-NEXT:  .LBB31_2: ; %loop
 ; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
-; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v8
+; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v8
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9-W64-NEXT:    s_cbranch_vccz .LBB31_1
 ; GFX9-W64-NEXT:  ; %bb.3:
-; GFX9-W64-NEXT:    s_mov_b64 s[2:3], -1
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
 ; GFX9-W64-NEXT:  .LBB31_4: ; %break
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-W32-LABEL: test_loop_vcc:
 ; GFX10-W32:       ; %bb.0: ; %entry
-; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v8, 0
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_branch .LBB31_2
 ; GFX10-W32-NEXT:    .p2align 6
 ; GFX10-W32-NEXT:  .LBB31_1: ; %body
 ; GFX10-W32-NEXT:    ; in Loop: Header=BB31_2 Depth=1
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
-; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
 ; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:    s_mov_b32 s1, 0
+; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB31_4
 ; GFX10-W32-NEXT:  .LBB31_2: ; %loop
 ; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -1925,11 +1892,10 @@
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX10-W32-NEXT:    s_cbranch_vccz .LBB31_1
 ; GFX10-W32-NEXT:  ; %bb.3:
-; GFX10-W32-NEXT:    s_mov_b32 s1, -1
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
 ; GFX10-W32-NEXT:  .LBB31_4: ; %break
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v5
@@ -1999,14 +1965,6 @@
 ; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v2, 2, v0
 ; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
-; GFX9-W64-NEXT:    s_mov_b32 s0, 0
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
@@ -2035,14 +1993,6 @@
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    buffer_load_dword v0, v2, s[8:11], 0 offen
 ; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
@@ -2079,18 +2029,10 @@
 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
 ; GFX9-W64-LABEL: test_nonvoid_return:
 ; GFX9-W64:       ; %bb.0:
-; GFX9-W64-NEXT:    s_mov_b32 s0, 0
-; GFX9-W64-NEXT:    s_mov_b64 s[8:9], exec
-; GFX9-W64-NEXT:    s_mov_b32 s1, s0
-; GFX9-W64-NEXT:    s_mov_b32 s2, s0
-; GFX9-W64-NEXT:    s_mov_b32 s3, s0
-; GFX9-W64-NEXT:    s_mov_b32 s4, s0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s0
-; GFX9-W64-NEXT:    s_mov_b32 s6, s0
-; GFX9-W64-NEXT:    s_mov_b32 s7, s0
+; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
 ; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
@@ -2098,18 +2040,10 @@
 ;
 ; GFX10-W32-LABEL: test_nonvoid_return:
 ; GFX10-W32:       ; %bb.0:
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
+; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
@@ -2128,20 +2062,11 @@
 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
 ; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
 ; GFX9-W64:       ; %bb.0: ; %entry
-; GFX9-W64-NEXT:    s_mov_b32 s4, 0
-; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
-; GFX9-W64-NEXT:    s_mov_b32 s5, s4
-; GFX9-W64-NEXT:    s_mov_b32 s6, s4
-; GFX9-W64-NEXT:    s_mov_b32 s7, s4
-; GFX9-W64-NEXT:    s_mov_b32 s8, s4
-; GFX9-W64-NEXT:    s_mov_b32 s9, s4
-; GFX9-W64-NEXT:    s_mov_b32 s10, s4
-; GFX9-W64-NEXT:    s_mov_b32 s11, s4
 ; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
-; GFX9-W64-NEXT:    image_sample v0, v0, s[4:11], s[0:3] dmask:0x1
-; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
+; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
+; GFX9-W64-NEXT:    s_and_b64 exec, exec, exec
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB34_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %else
@@ -2155,20 +2080,11 @@
 ;
 ; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
 ; GFX10-W32:       ; %bb.0: ; %entry
-; GFX10-W32-NEXT:    s_mov_b32 s4, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
-; GFX10-W32-NEXT:    s_mov_b32 s5, s4
-; GFX10-W32-NEXT:    s_mov_b32 s6, s4
-; GFX10-W32-NEXT:    s_mov_b32 s7, s4
-; GFX10-W32-NEXT:    s_mov_b32 s8, s4
-; GFX10-W32-NEXT:    s_mov_b32 s9, s4
-; GFX10-W32-NEXT:    s_mov_b32 s10, s4
-; GFX10-W32-NEXT:    s_mov_b32 s11, s4
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
-; GFX10-W32-NEXT:    image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
+; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB34_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %else
@@ -2215,33 +2131,17 @@
 ; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB35_2
 ; GFX9-W64-NEXT:  ; %bb.1: ; %else
-; GFX9-W64-NEXT:    s_mov_b32 s4, 0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
-; GFX9-W64-NEXT:    s_mov_b32 s5, s4
-; GFX9-W64-NEXT:    s_mov_b32 s6, s4
-; GFX9-W64-NEXT:    s_mov_b32 s7, s4
-; GFX9-W64-NEXT:    s_mov_b32 s8, s4
-; GFX9-W64-NEXT:    s_mov_b32 s9, s4
-; GFX9-W64-NEXT:    s_mov_b32 s10, s4
-; GFX9-W64-NEXT:    s_mov_b32 s11, s4
-; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:    s_cbranch_execz .LBB35_3
 ; GFX9-W64-NEXT:    s_branch .LBB35_4
 ; GFX9-W64-NEXT:  .LBB35_2:
 ; GFX9-W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
 ; GFX9-W64-NEXT:  .LBB35_3: ; %if
-; GFX9-W64-NEXT:    s_mov_b32 s4, 0
-; GFX9-W64-NEXT:    s_mov_b32 s5, s4
-; GFX9-W64-NEXT:    s_mov_b32 s6, s4
-; GFX9-W64-NEXT:    s_mov_b32 s7, s4
-; GFX9-W64-NEXT:    s_mov_b32 s8, s4
-; GFX9-W64-NEXT:    s_mov_b32 s9, s4
-; GFX9-W64-NEXT:    s_mov_b32 s10, s4
-; GFX9-W64-NEXT:    s_mov_b32 s11, s4
 ; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
+; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
 ; GFX9-W64-NEXT:  .LBB35_4: ; %end
 ; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
 ; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1.0
@@ -2252,21 +2152,13 @@
 ; GFX10-W32-LABEL: test_scc:
 ; GFX10-W32:       ; %bb.0: ; %main_body
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
-; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
+; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB35_2
 ; GFX10-W32-NEXT:  ; %bb.1: ; %else
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
 ; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
 ; GFX10-W32-NEXT:    s_cbranch_execz .LBB35_3
 ; GFX10-W32-NEXT:    s_branch .LBB35_4
@@ -2275,17 +2167,9 @@
 ; GFX10-W32-NEXT:  .LBB35_3: ; %if
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s0, 0
-; GFX10-W32-NEXT:    s_mov_b32 s1, s0
-; GFX10-W32-NEXT:    s_mov_b32 s2, s0
-; GFX10-W32-NEXT:    s_mov_b32 s3, s0
-; GFX10-W32-NEXT:    s_mov_b32 s4, s0
-; GFX10-W32-NEXT:    s_mov_b32 s5, s0
-; GFX10-W32-NEXT:    s_mov_b32 s6, s0
-; GFX10-W32-NEXT:    s_mov_b32 s7, s0
 ; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-W32-NEXT:  .LBB35_4: ; %end
-; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
+; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
 ; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX10-W32-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
 ; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)