diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -478,11 +478,7 @@
                                          ? MRI.getRegClass(Reg)
                                          : RI.getPhysRegClass(Reg);
 
-  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
-  // of clusterNeighboringMemOps which was previosly passing cluster length
-  // less 1. LoadClusterThreshold should be tuned instead.
-  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
-         LoadClusterThreshold;
+  return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll
@@ -162,15 +162,15 @@
 define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 ; GFX6-LABEL: bfe_u32_zextload_i8:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; GFX6-NEXT:    s_mov_b32 s2, -1
-; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX6-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 0, 8
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
   %load = load i8, i8 addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -214,43 +214,44 @@
 ;
 ; GCN-LABEL: sdiv_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s8, s3, 31
-; GCN-NEXT:    s_add_i32 s3, s3, s8
-; GCN-NEXT:    s_xor_b32 s9, s3, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
-; GCN-NEXT:    s_xor_b32 s2, s2, s3
+; GCN-NEXT:    s_ashr_i32 s8, s5, 31
+; GCN-NEXT:    s_add_i32 s2, s5, s8
+; GCN-NEXT:    s_xor_b32 s11, s2, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s11
+; GCN-NEXT:    s_ashr_i32 s9, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s9
+; GCN-NEXT:    s_xor_b32 s10, s4, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s3, s3, s8
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s9
+; GCN-NEXT:    v_mul_lo_u32 v1, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s11
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s10
+; GCN-NEXT:    s_xor_b32 s2, s9, s8
+; GCN-NEXT:    v_mul_lo_u32 v1, v0, s11
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s2, v1
-; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, s2, v1
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v1
+; GCN-NEXT:    v_cmp_ge_u32_e32 vcc, s10, v1
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
 ; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
@@ -480,20 +481,20 @@
 ;
 ; GCN-LABEL: sdiv_i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s1, s0, 16
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GCN-NEXT:    s_sext_i32_i16 s0, s0
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_ashr_i32 s5, s4, 16
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GCN-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
+; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
@@ -501,7 +502,7 @@
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv i16 %x, %y
   store i16 %r, i16 addrspace(1)* %out
@@ -691,20 +692,20 @@
 ;
 ; GCN-LABEL: sdiv_i8:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GCN-NEXT:    s_sext_i32_i8 s0, s0
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_bfe_i32 s5, s4, 0x80008
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GCN-NEXT:    s_sext_i32_i8 s4, s4
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
+; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
@@ -712,7 +713,7 @@
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv i8 %x, %y
   store i8 %r, i8 addrspace(1)* %out
@@ -1237,14 +1238,14 @@
 ; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, s12
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[2:3]
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, s13
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, v1, s13
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
@@ -1868,84 +1869,83 @@
 ; GCN-LABEL: srem_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[12:19], s[0:1], 0xd
+; GCN-NEXT:    s_mov_b32 s20, 0x4f800000
 ; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-NEXT:    s_mov_b32 s10, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i32 s2, s16, 31
 ; GCN-NEXT:    s_add_i32 s3, s16, s2
-; GCN-NEXT:    s_xor_b32 s5, s3, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
-; GCN-NEXT:    s_mov_b32 s16, 0x4f800000
+; GCN-NEXT:    s_xor_b32 s16, s3, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s16
 ; GCN-NEXT:    s_ashr_i32 s6, s12, 31
-; GCN-NEXT:    s_ashr_i32 s2, s17, 31
+; GCN-NEXT:    s_ashr_i32 s4, s17, 31
+; GCN-NEXT:    s_add_i32 s2, s12, s6
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_add_i32 s0, s12, s6
-; GCN-NEXT:    s_add_i32 s3, s17, s2
-; GCN-NEXT:    s_xor_b32 s4, s0, s6
-; GCN-NEXT:    v_mul_f32_e32 v0, s16, v0
+; GCN-NEXT:    s_add_i32 s5, s17, s4
+; GCN-NEXT:    s_xor_b32 s7, s2, s6
+; GCN-NEXT:    s_xor_b32 s17, s5, s4
+; GCN-NEXT:    v_mul_f32_e32 v0, s20, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s17, s3, s2
-; GCN-NEXT:    s_ashr_i32 s7, s13, 31
-; GCN-NEXT:    s_add_i32 s12, s13, s7
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s5
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s5
-; GCN-NEXT:    s_xor_b32 s12, s12, s7
+; GCN-NEXT:    v_mul_lo_u32 v1, v0, s16
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s16
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s17
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s4
-; GCN-NEXT:    v_mul_f32_e32 v1, s16, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[2:3]
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s7
+; GCN-NEXT:    v_mul_f32_e32 v1, s20, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s16
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, s17
 ; GCN-NEXT:    v_mul_hi_u32 v5, v1, s17
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s4, v0
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, s5, v2
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v0
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s7, v0
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
 ; GCN-NEXT:    v_mul_hi_u32 v4, v4, v1
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, s16, v2
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s16, v2
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v4, v1
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
 ; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
 ; GCN-NEXT:    s_ashr_i32 s0, s18, 31
+; GCN-NEXT:    s_ashr_i32 s7, s13, 31
 ; GCN-NEXT:    s_add_i32 s1, s18, s0
+; GCN-NEXT:    s_add_i32 s12, s13, s7
 ; GCN-NEXT:    s_xor_b32 s13, s1, s0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s13
+; GCN-NEXT:    s_xor_b32 s12, s12, s7
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[4:5]
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, s12
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[2:3]
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[2:3]
 ; GCN-NEXT:    v_xor_b32_e32 v0, s6, v0
 ; GCN-NEXT:    v_mul_lo_u32 v1, v1, s17
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, s16, v2
+; GCN-NEXT:    v_mul_f32_e32 v2, s20, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s12, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s12, v1
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v3
 ; GCN-NEXT:    v_mul_lo_u32 v5, v2, s13
 ; GCN-NEXT:    v_mul_hi_u32 v6, v2, s13
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s17, v3
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, s17, v3
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s17, v3
 ; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 0, v5
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
 ; GCN-NEXT:    v_mul_hi_u32 v5, v5, v2
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s17, v3
 ; GCN-NEXT:    s_ashr_i32 s6, s14, 31
 ; GCN-NEXT:    s_add_i32 s12, s14, s6
-; GCN-NEXT:    s_xor_b32 s12, s12, s6
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v5, v2
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v5, v2
 ; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
@@ -1955,36 +1955,37 @@
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s14
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
+; GCN-NEXT:    s_xor_b32 s12, s12, s6
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, s12
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[2:3]
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[2:3]
 ; GCN-NEXT:    v_xor_b32_e32 v1, s7, v1
 ; GCN-NEXT:    v_mul_lo_u32 v2, v2, s13
-; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s7, v1
-; GCN-NEXT:    v_mul_f32_e32 v3, s16, v3
+; GCN-NEXT:    v_mul_f32_e32 v3, s20, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s7, v1
 ; GCN-NEXT:    s_ashr_i32 s7, s15, 31
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s12, v2
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s12, v2
 ; GCN-NEXT:    v_mul_lo_u32 v6, v3, s14
 ; GCN-NEXT:    v_mul_hi_u32 v7, v3, s14
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s12, v2
 ; GCN-NEXT:    s_add_i32 s12, s15, s7
-; GCN-NEXT:    s_xor_b32 s12, s12, s7
 ; GCN-NEXT:    v_sub_i32_e32 v8, vcc, 0, v6
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v7
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[4:5]
 ; GCN-NEXT:    v_mul_hi_u32 v6, v6, v3
+; GCN-NEXT:    s_xor_b32 s12, s12, s7
 ; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, s13, v4
-; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s13, v4
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v6, v3
 ; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v6, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
 ; GCN-NEXT:    v_mul_hi_u32 v3, v3, s12
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s13, v4
 ; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
 ; GCN-NEXT:    v_mul_lo_u32 v3, v3, s14
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
 ; GCN-NEXT:    v_xor_b32_e32 v2, s6, v2
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v2
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s12, v3
@@ -2837,20 +2838,20 @@
 ;
 ; GCN-LABEL: sdiv_i3:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
-; GCN-NEXT:    s_bfe_i32 s0, s0, 0x30000
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_bfe_i32 s5, s4, 0x30008
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s5
+; GCN-NEXT:    s_bfe_i32 s4, s4, 0x30000
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GCN-NEXT:    s_xor_b32 s4, s4, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
+; GCN-NEXT:    s_or_b32 s4, s4, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
@@ -2859,7 +2860,7 @@
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
-; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv i3 %x, %y
   store i3 %r, i3 addrspace(1)* %out
@@ -2995,21 +2996,21 @@
 ;
 ; GCN-LABEL: udiv_v3i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xd
 ; GCN-NEXT:    s_mov_b32 s8, 0xffff
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s6, s0, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GCN-NEXT:    s_and_b32 s6, s2, s8
-; GCN-NEXT:    s_lshr_b32 s0, s0, 16
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GCN-NEXT:    s_and_b32 s9, s4, s8
+; GCN-NEXT:    s_and_b32 s2, s6, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT:    s_lshr_b32 s6, s6, 16
+; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s6
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_lshr_b32 s0, s2, 16
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GCN-NEXT:    s_lshr_b32 s4, s4, 16
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
@@ -3018,16 +3019,16 @@
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
-; GCN-NEXT:    s_and_b32 s0, s1, s8
+; GCN-NEXT:    s_and_b32 s4, s7, s8
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
-; GCN-NEXT:    s_and_b32 s0, s3, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GCN-NEXT:    s_and_b32 s4, s5, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s4
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
@@ -3038,8 +3039,8 @@
 ; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -3259,18 +3260,18 @@
 ;
 ; GCN-LABEL: sdiv_v3i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sext_i32_i16 s9, s2
-; GCN-NEXT:    s_sext_i32_i16 s8, s0
+; GCN-NEXT:    s_sext_i32_i16 s9, s4
+; GCN-NEXT:    s_sext_i32_i16 s8, s6
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
 ; GCN-NEXT:    s_xor_b32 s8, s9, s8
-; GCN-NEXT:    s_ashr_i32 s0, s0, 16
+; GCN-NEXT:    s_ashr_i32 s6, s6, 16
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GCN-NEXT:    s_ashr_i32 s8, s8, 30
 ; GCN-NEXT:    s_or_b32 s8, s8, 1
@@ -3280,44 +3281,44 @@
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s6
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    s_ashr_i32 s2, s2, 16
+; GCN-NEXT:    s_ashr_i32 s4, s4, 16
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
-; GCN-NEXT:    s_xor_b32 s0, s2, s0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    s_xor_b32 s4, s4, s6
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
-; GCN-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-NEXT:    s_sext_i32_i16 s0, s1
+; GCN-NEXT:    v_mov_b32_e32 v4, s4
+; GCN-NEXT:    s_sext_i32_i16 s4, s7
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; GCN-NEXT:    s_sext_i32_i16 s1, s3
+; GCN-NEXT:    s_sext_i32_i16 s5, s5
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
-; GCN-NEXT:    s_xor_b32 s0, s1, s0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    s_xor_b32 s4, s5, s4
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
 ; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
-; GCN-NEXT:    v_mov_b32_e32 v5, s0
+; GCN-NEXT:    v_mov_b32_e32 v5, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -3409,74 +3410,74 @@
 ;
 ; GCN-LABEL: srem_v3i16:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sext_i32_i16 s8, s2
-; GCN-NEXT:    s_sext_i32_i16 s6, s0
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
-; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s8
-; GCN-NEXT:    s_xor_b32 s6, s8, s6
-; GCN-NEXT:    s_ashr_i32 s6, s6, 30
+; GCN-NEXT:    s_sext_i32_i16 s9, s4
+; GCN-NEXT:    s_sext_i32_i16 s8, s6
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
+; GCN-NEXT:    s_xor_b32 s8, s9, s8
+; GCN-NEXT:    s_ashr_i32 s8, s8, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_or_b32 s6, s6, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s6
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_or_b32 s8, s8, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s8
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GCN-NEXT:    v_mov_b32_e32 v1, s2
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 16
+; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_alignbit_b32 v2, s7, v2, 16
 ; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v3
-; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
+; GCN-NEXT:    v_alignbit_b32 v1, s5, v1, 16
 ; GCN-NEXT:    v_bfe_i32 v5, v1, 0, 16
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v6, v5
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
 ; GCN-NEXT:    v_xor_b32_e32 v3, v5, v3
-; GCN-NEXT:    s_sext_i32_i16 s0, s1
+; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
 ; GCN-NEXT:    v_mul_f32_e32 v5, v6, v7
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GCN-NEXT:    v_mad_f32 v6, -v5, v4, v6
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
+; GCN-NEXT:    s_sext_i32_i16 s4, s7
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
-; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
+; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s4
 ; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    s_sext_i32_i16 s2, s3
+; GCN-NEXT:    s_sext_i32_i16 s6, s5
 ; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
-; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s2
+; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s6
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v4
-; GCN-NEXT:    s_xor_b32 s0, s2, s0
-; GCN-NEXT:    s_ashr_i32 s0, s0, 30
-; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    s_xor_b32 s4, s6, s4
+; GCN-NEXT:    s_ashr_i32 s4, s4, 30
+; GCN-NEXT:    s_or_b32 s4, s4, 1
 ; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
 ; GCN-NEXT:    v_trunc_f32_e32 v5, v5
 ; GCN-NEXT:    v_mad_f32 v3, -v5, v4, v3
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
-; GCN-NEXT:    v_mov_b32_e32 v6, s0
+; GCN-NEXT:    v_mov_b32_e32 v6, s4
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
+; GCN-NEXT:    v_mul_lo_u32 v3, v3, s7
 ; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s5, v3
 ; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_short v2, off, s[0:3], 0 offset:4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = srem <3 x i16> %x, %y
   store <3 x i16> %r, <3 x i16> addrspace(1)* %out
@@ -4081,18 +4082,18 @@
 ;
 ; GCN-LABEL: udiv_i32_oddk_denom:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = udiv i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
@@ -4188,20 +4189,20 @@
 ;
 ; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x100101
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GCN-NEXT:    s_lshr_b32 s0, s0, 12
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GCN-NEXT:    s_lshr_b32 s4, s4, 12
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, <2 x i32> addrspace(1)* %out
@@ -4298,61 +4299,61 @@
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-NEXT:    s_movk_i32 s4, 0x1000
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s2, s4, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT:    s_lshl_b32 s10, s4, s3
-; GCN-NEXT:    s_mov_b32 s3, 0x4f800000
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
+; GCN-NEXT:    s_lshl_b32 s10, s4, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GCN-NEXT:    s_mov_b32 s2, 0x4f800000
+; GCN-NEXT:    s_lshl_b32 s11, s4, s3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, s3, v0
+; GCN-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
+; GCN-NEXT:    v_mul_f32_e32 v1, s2, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s2
+; GCN-NEXT:    v_mul_lo_u32 v2, v0, s10
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[2:3]
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, s10
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
-; GCN-NEXT:    v_mul_hi_u32 v2, v1, s10
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GCN-NEXT:    v_mul_hi_u32 v2, v1, s11
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, v1
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, s10
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v1
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v2, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, s9
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v5
-; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s2, v3
+; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v3
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, s10
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, s11
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s8, v5
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GCN-NEXT:    s_and_b64 vcc, s[2:3], s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s9, v4
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
 ; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[2:3]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -4369,20 +4370,20 @@
 ;
 ; GCN-LABEL: urem_i32_oddk_denom:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
 ; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x12d8fb, v0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = urem i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
@@ -4557,6 +4558,7 @@
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-NEXT:    s_movk_i32 s4, 0x1000
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4567,7 +4569,6 @@
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GCN-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
@@ -4576,16 +4577,15 @@
 ; GCN-NEXT:    v_mul_lo_u32 v2, v0, s10
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, s10
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v2
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v3
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[2:3]
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, s11
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v2, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
 ; GCN-NEXT:    v_mul_hi_u32 v2, v1, s11
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, s8
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
@@ -4612,6 +4612,7 @@
 ; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[2:3]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -4628,18 +4629,18 @@
 ;
 ; GCN-LABEL: sdiv_i32_oddk_denom:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = sdiv i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
@@ -4680,44 +4681,45 @@
 ;
 ; GCN-LABEL: sdiv_i32_pow2_shl_denom:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
-; GCN-NEXT:    s_ashr_i32 s8, s3, 31
-; GCN-NEXT:    s_add_i32 s3, s3, s8
-; GCN-NEXT:    s_xor_b32 s9, s3, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
-; GCN-NEXT:    s_ashr_i32 s3, s2, 31
-; GCN-NEXT:    s_add_i32 s2, s2, s3
-; GCN-NEXT:    s_xor_b32 s2, s2, s3
+; GCN-NEXT:    s_lshl_b32 s2, 0x1000, s5
+; GCN-NEXT:    s_ashr_i32 s8, s2, 31
+; GCN-NEXT:    s_add_i32 s2, s2, s8
+; GCN-NEXT:    s_xor_b32 s11, s2, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s11
+; GCN-NEXT:    s_ashr_i32 s9, s4, 31
+; GCN-NEXT:    s_add_i32 s4, s4, s9
+; GCN-NEXT:    s_xor_b32 s10, s4, s9
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s3, s3, s8
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s9
+; GCN-NEXT:    v_mul_lo_u32 v1, v0, s11
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s11
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v1, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s2
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[2:3]
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s10
+; GCN-NEXT:    s_xor_b32 s2, s9, s8
+; GCN-NEXT:    v_mul_lo_u32 v1, v0, s11
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s2, v1
-; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[0:1], s10, v1
+; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s10, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v1
 ; GCN-NEXT:    s_and_b64 vcc, vcc, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[0:1]
-; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
@@ -4905,56 +4907,56 @@
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-NEXT:    s_movk_i32 s4, 0x1000
-; GCN-NEXT:    s_mov_b32 s14, 0x4f800000
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s15, 0x4f800000
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xb
 ; GCN-NEXT:    s_mov_b32 s11, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s2, s4, s2
 ; GCN-NEXT:    s_ashr_i32 s5, s2, 31
 ; GCN-NEXT:    s_add_i32 s2, s2, s5
-; GCN-NEXT:    s_xor_b32 s13, s2, s5
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s13
-; GCN-NEXT:    s_ashr_i32 s2, s6, 31
-; GCN-NEXT:    s_lshl_b32 s0, s4, s3
-; GCN-NEXT:    s_add_i32 s1, s6, s2
+; GCN-NEXT:    s_xor_b32 s14, s2, s5
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GCN-NEXT:    s_lshl_b32 s2, s4, s3
+; GCN-NEXT:    s_ashr_i32 s4, s6, 31
+; GCN-NEXT:    s_add_i32 s3, s6, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GCN-NEXT:    s_ashr_i32 s6, s0, 31
-; GCN-NEXT:    s_add_i32 s4, s0, s6
-; GCN-NEXT:    s_xor_b32 s3, s1, s2
-; GCN-NEXT:    v_mul_f32_e32 v0, s14, v0
+; GCN-NEXT:    s_ashr_i32 s6, s2, 31
+; GCN-NEXT:    s_add_i32 s8, s2, s6
+; GCN-NEXT:    s_xor_b32 s12, s3, s4
+; GCN-NEXT:    v_mul_f32_e32 v0, s15, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    s_xor_b32 s15, s4, s6
-; GCN-NEXT:    s_xor_b32 s12, s2, s5
+; GCN-NEXT:    s_xor_b32 s16, s8, s6
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; GCN-NEXT:    s_xor_b32 s13, s4, s5
+; GCN-NEXT:    v_mul_lo_u32 v1, v0, s14
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s14
 ; GCN-NEXT:    s_mov_b32 s10, -1
-; GCN-NEXT:    v_mul_lo_u32 v1, v0, s13
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s13
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 0, v1
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[2:3]
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, v0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s15
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s16
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v0
 ; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[2:3]
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v2
-; GCN-NEXT:    v_mul_hi_u32 v0, v0, s3
-; GCN-NEXT:    v_mul_f32_e32 v1, s14, v1
-; GCN-NEXT:    v_mul_lo_u32 v2, v0, s13
+; GCN-NEXT:    v_mul_hi_u32 v0, v0, s12
+; GCN-NEXT:    v_mul_f32_e32 v1, s15, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, v0, s14
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v0
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v1, s15
-; GCN-NEXT:    v_mul_hi_u32 v5, v1, s15
-; GCN-NEXT:    s_ashr_i32 s13, s7, 31
-; GCN-NEXT:    s_add_i32 s7, s7, s13
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s12, v2
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v1, s16
+; GCN-NEXT:    v_mul_hi_u32 v5, v1, s16
+; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s12, v2
+; GCN-NEXT:    s_ashr_i32 s12, s7, 31
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4
 ; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
 ; GCN-NEXT:    v_mul_hi_u32 v4, v4, v1
-; GCN-NEXT:    s_xor_b32 s7, s7, s13
-; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s3, v2
+; GCN-NEXT:    s_add_i32 s7, s7, s12
+; GCN-NEXT:    s_xor_b32 s7, s7, s12
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v4, v1
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, v4, v1
@@ -4963,12 +4965,12 @@
 ; GCN-NEXT:    s_and_b64 vcc, s[0:1], s[2:3]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[2:3]
-; GCN-NEXT:    v_mul_lo_u32 v2, v1, s15
-; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
-; GCN-NEXT:    s_xor_b32 s4, s13, s6
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, s16
+; GCN-NEXT:    v_xor_b32_e32 v0, s13, v0
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s13, v0
+; GCN-NEXT:    s_xor_b32 s4, s12, s6
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v2
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v3
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s16, v3
 ; GCN-NEXT:    v_cmp_ge_u32_e64 s[2:3], s7, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, -1, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
@@ -4977,6 +4979,7 @@
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[2:3]
 ; GCN-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -4993,20 +4996,20 @@
 ;
 ; GCN-LABEL: srem_i32_oddk_denom:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_mul_hi_i32 v0, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT:    v_mul_i32_i24_e32 v0, 0x12d8fb, v0
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %r = srem i32 %x, 1235195
   store i32 %r, i32 addrspace(1)* %out
@@ -5484,13 +5487,13 @@
 ;
 ; GCN-LABEL: udiv_i64_pow2_shl_denom:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_add_i32 s8, s8, 12
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
@@ -5551,79 +5554,79 @@
 ; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_movk_i32 s6, 0xf001
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-NEXT:    s_movk_i32 s0, 0xfff
-; GCN-NEXT:    v_mul_hi_u32 v3, v0, s6
-; GCN-NEXT:    v_mul_lo_u32 v5, v1, s6
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s6
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s6
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s6
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    s_movk_i32 s0, 0xfff
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v8, v1, v4
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GCN-NEXT:    v_mul_hi_u32 v5, v0, s6
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v1, v4, s[2:3]
-; GCN-NEXT:    v_mul_lo_u32 v6, v3, s6
-; GCN-NEXT:    v_mul_lo_u32 v8, v0, s6
-; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_mul_hi_u32 v4, v0, s6
+; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
+; GCN-NEXT:    v_mul_lo_u32 v5, v2, s6
+; GCN-NEXT:    v_mul_lo_u32 v6, v0, s6
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
 ; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v8
-; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v11, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
-; GCN-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GCN-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v8, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v2, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v10, v2, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v2, v6
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[2:3]
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v6, s11, v1
+; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s10, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s11, v1
 ; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
 ; GCN-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v2, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, v1, s0
 ; GCN-NEXT:    v_mul_hi_u32 v3, v0, s0
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, s0
@@ -6333,9 +6336,8 @@
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_movk_i32 s6, 0xf001
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
@@ -6364,6 +6366,7 @@
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -6494,8 +6497,8 @@
 ; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_subb_u32 s7, 0, s15
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
@@ -7136,8 +7139,8 @@
 ; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    s_subb_u32 s7, 0, s17
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
 ; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v1, v1
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -330,36 +330,35 @@
 define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
 ; SI-LABEL: v_brev_i64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s6, 0xff00ff
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b32 s2, 0xff00ff
 ; SI-NEXT:    s_mov_b32 s8, 0xf0f0f0f
 ; SI-NEXT:    s_mov_b32 s9, 0xf0f0f0f0
 ; SI-NEXT:    s_mov_b32 s10, 0x33333333
 ; SI-NEXT:    s_mov_b32 s11, 0xcccccccc
-; SI-NEXT:    s_mov_b32 s0, 0x55555555
-; SI-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
+; SI-NEXT:    s_mov_b32 s12, 0x55555555
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
 ; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT:    v_bfi_b32 v2, s6, v0, v2
-; SI-NEXT:    v_bfi_b32 v4, s6, v1, v3
+; SI-NEXT:    v_bfi_b32 v2, s2, v0, v2
+; SI-NEXT:    v_bfi_b32 v4, s2, v1, v3
 ; SI-NEXT:    v_and_b32_e32 v1, s8, v2
 ; SI-NEXT:    v_and_b32_e32 v0, s8, v4
 ; SI-NEXT:    v_and_b32_e32 v3, s9, v2
 ; SI-NEXT:    v_and_b32_e32 v2, s9, v4
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
-; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v0
 ; SI-NEXT:    v_and_b32_e32 v1, s10, v3
@@ -370,64 +369,66 @@
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
 ; SI-NEXT:    v_or_b32_e32 v3, v3, v1
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v0
-; SI-NEXT:    v_and_b32_e32 v1, s0, v3
-; SI-NEXT:    v_and_b32_e32 v0, s0, v2
-; SI-NEXT:    v_and_b32_e32 v3, s1, v3
-; SI-NEXT:    v_and_b32_e32 v2, s1, v2
+; SI-NEXT:    v_and_b32_e32 v1, s12, v3
+; SI-NEXT:    v_and_b32_e32 v0, s12, v2
+; SI-NEXT:    v_and_b32_e32 v3, s2, v3
+; SI-NEXT:    v_and_b32_e32 v2, s2, v2
 ; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
 ; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
-; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; FLAT-LABEL: v_brev_i64:
 ; FLAT:       ; %bb.0:
-; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
-; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
 ; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; FLAT-NEXT:    s_mov_b32 s6, 0x10203
-; FLAT-NEXT:    s_mov_b32 s2, 0x33333333
-; FLAT-NEXT:    s_mov_b32 s3, 0xcccccccc
+; FLAT-NEXT:    s_mov_b32 s8, 0x10203
+; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f
+; FLAT-NEXT:    s_mov_b32 s6, 0xcccccccc
 ; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_mov_b32_e32 v1, s1
-; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; FLAT-NEXT:    v_mov_b32_e32 v1, s5
+; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
 ; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; FLAT-NEXT:    s_mov_b32 s0, 0xf0f0f0f
-; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f0
-; FLAT-NEXT:    s_mov_b32 s7, 0xf000
+; FLAT-NEXT:    s_mov_b32 s4, 0xf0f0f0f0
+; FLAT-NEXT:    s_mov_b32 s5, 0x33333333
+; FLAT-NEXT:    s_mov_b32 s7, 0x55555555
+; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; FLAT-NEXT:    s_mov_b32 s3, 0xf000
 ; FLAT-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; FLAT-NEXT:    v_perm_b32 v2, 0, v0, s6
-; FLAT-NEXT:    v_perm_b32 v4, 0, v1, s6
-; FLAT-NEXT:    v_and_b32_e32 v1, s0, v2
-; FLAT-NEXT:    v_and_b32_e32 v0, s0, v4
-; FLAT-NEXT:    v_and_b32_e32 v3, s1, v2
-; FLAT-NEXT:    v_and_b32_e32 v2, s1, v4
+; FLAT-NEXT:    v_perm_b32 v2, 0, v0, s8
+; FLAT-NEXT:    v_perm_b32 v4, 0, v1, s8
+; FLAT-NEXT:    v_and_b32_e32 v1, s2, v2
+; FLAT-NEXT:    v_and_b32_e32 v0, s2, v4
+; FLAT-NEXT:    v_and_b32_e32 v3, s4, v2
+; FLAT-NEXT:    v_and_b32_e32 v2, s4, v4
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s0, 0x55555555
+; FLAT-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
 ; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
 ; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s3, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s3, v2
+; FLAT-NEXT:    v_and_b32_e32 v1, s5, v3
+; FLAT-NEXT:    v_and_b32_e32 v0, s5, v2
+; FLAT-NEXT:    v_and_b32_e32 v3, s6, v3
+; FLAT-NEXT:    v_and_b32_e32 v2, s6, v2
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
 ; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
 ; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
-; FLAT-NEXT:    v_and_b32_e32 v1, s0, v3
-; FLAT-NEXT:    v_and_b32_e32 v0, s0, v2
-; FLAT-NEXT:    v_and_b32_e32 v3, s1, v3
-; FLAT-NEXT:    v_and_b32_e32 v2, s1, v2
+; FLAT-NEXT:    v_and_b32_e32 v1, s7, v3
+; FLAT-NEXT:    v_and_b32_e32 v0, s7, v2
+; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
+; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
 ; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
 ; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
-; FLAT-NEXT:    s_mov_b32 s6, -1
+; FLAT-NEXT:    s_mov_b32 s2, -1
 ; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
 ; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
-; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; FLAT-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -843,22 +843,22 @@
 
 ; GCN-LABEL: {{^}}stack_8xv5i32:
 
-; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
-; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
-; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
-; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
-; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; GCN: buffer_store_dword [[REG8]], {{.*$}}
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
+; GCN-DAG: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
+; GCN-DAG: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
+; GCN-DAG: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
+; GCN-DAG: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
+; GCN-DAG: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
+; GCN-DAG: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
+; GCN-DAG: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
+; GCN-DAG: buffer_store_dword [[REG8]], {{.*$}}
+; GCN-DAG: buffer_store_dword [[REG9]], {{.*}} offset:4
+; GCN-DAG: buffer_store_dword [[REG10]], {{.*}} offset:8
+; GCN-DAG: buffer_store_dword [[REG11]], {{.*}} offset:12
+; GCN-DAG: buffer_store_dword [[REG12]], {{.*}} offset:16
+; GCN-DAG: buffer_store_dword [[REG13]], {{.*}} offset:20
+; GCN-DAG: buffer_store_dword [[REG14]], {{.*}} offset:24
+; GCN-DAG: buffer_store_dword [[REG15]], {{.*}} offset:28
 
 ; GCN: v_mov_b32_e32 v31, 7
 ; GCN: s_getpc
@@ -877,22 +877,22 @@
 }
 
 ; GCN-LABEL: {{^}}stack_8xv5f32:
-; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
-; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
-; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
-; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
-; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; GCN: buffer_store_dword [[REG8]], {{.*$}}
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
+; GCN-DAG: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
+; GCN-DAG: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
+; GCN-DAG: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
+; GCN-DAG: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
+; GCN-DAG: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
+; GCN-DAG: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
+; GCN-DAG: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
+; GCN-DAG: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
+; GCN-DAG: buffer_store_dword [[REG8]], {{.*$}}
+; GCN-DAG: buffer_store_dword [[REG9]], {{.*}} offset:4
+; GCN-DAG: buffer_store_dword [[REG10]], {{.*}} offset:8
+; GCN-DAG: buffer_store_dword [[REG11]], {{.*}} offset:12
+; GCN-DAG: buffer_store_dword [[REG12]], {{.*}} offset:16
+; GCN-DAG: buffer_store_dword [[REG13]], {{.*}} offset:20
+; GCN-DAG: buffer_store_dword [[REG14]], {{.*}} offset:24
+; GCN-DAG: buffer_store_dword [[REG15]], {{.*}} offset:28
 
 ; GCN: v_mov_b32_e32 v31, 0x40e00000
 ; GCN: s_getpc
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -50,37 +50,38 @@
 define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
 ; SI-LABEL: test_copy_v4i8_x2:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; SI-NEXT:    s_mov_b32 s11, 0xf000
-; SI-NEXT:    s_mov_b32 s2, 0
-; SI-NEXT:    s_mov_b32 s3, s11
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_mov_b32 s8, s6
-; SI-NEXT:    s_mov_b32 s9, s7
-; SI-NEXT:    s_mov_b32 s6, s10
-; SI-NEXT:    s_mov_b32 s7, s11
+; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s6
+; SI-NEXT:    s_mov_b32 s1, s7
+; SI-NEXT:    s_mov_b32 s6, s2
+; SI-NEXT:    s_mov_b32 s7, s3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_copy_v4i8_x2:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v0, v[0:1]
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_mov_b32 s0, s6
 ; VI-NEXT:    s_mov_b32 s1, s7
 ; VI-NEXT:    s_mov_b32 s6, s2
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -45,11 +45,11 @@
 
 ; FUNC-LABEL: {{^}}v_ctpop_i64_user:
 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
-; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
-; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
-; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]]
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
+; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
+; SI-DAG: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; VI-DAG: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
+; GCN: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -357,29 +357,28 @@
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
-; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
+; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:5
+; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:6
 ; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
-; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
-; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3
-; SI-NEXT:    buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:4
+; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:2
+; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt vmcnt(4)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
-; SI-NEXT:    s_waitcnt vmcnt(3)
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v6
+; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v5
-; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v2
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshlrev_b32_e32 v7, 8, v7
-; SI-NEXT:    v_or_b32_e32 v2, v7, v6
+; SI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v3
 ; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
-; SI-NEXT:    v_cvt_f32_ubyte0_e32 v8, v3
+; SI-NEXT:    v_lshlrev_b32_e32 v8, 8, v0
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
+; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v2
+; SI-NEXT:    v_or_b32_e32 v2, v8, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
 ; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
-; SI-NEXT:    buffer_store_dword v8, off, s[4:7], 0 offset:24
+; SI-NEXT:    buffer_store_dword v6, off, s[4:7], 0 offset:24
 ; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -407,27 +406,27 @@
 ; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v12, vcc, 1, v0
 ; VI-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; VI-NEXT:    flat_load_ubyte v6, v[6:7]
-; VI-NEXT:    flat_load_ubyte v7, v[8:9]
-; VI-NEXT:    flat_load_ubyte v8, v[10:11]
-; VI-NEXT:    flat_load_ubyte v9, v[12:13]
+; VI-NEXT:    flat_load_ubyte v8, v[8:9]
+; VI-NEXT:    flat_load_ubyte v9, v[10:11]
+; VI-NEXT:    flat_load_ubyte v10, v[12:13]
 ; VI-NEXT:    flat_load_ubyte v0, v[0:1]
 ; VI-NEXT:    flat_load_ubyte v1, v[2:3]
 ; VI-NEXT:    flat_load_ubyte v2, v[4:5]
+; VI-NEXT:    flat_load_ubyte v3, v[6:7]
 ; VI-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v6
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
 ; VI-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
-; VI-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; VI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v8
-; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v5, v9
+; VI-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
 ; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; VI-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v1
 ; VI-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
+; VI-NEXT:    v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v3
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v2
-; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v9
+; VI-NEXT:    v_cvt_f32_ubyte2_e32 v1, v10
 ; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
 ; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -2443,12 +2443,12 @@
 }
 
 ; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
-; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
-; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
-; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
 
-; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
+; GCN-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
 ; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
 ; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1196,9 +1196,9 @@
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
 ; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
-; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
-; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12
+; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20
 ; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
 ; SI-NEXT:    s_mov_b32 s4, 0xffffff
 ; SI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
@@ -1212,32 +1212,31 @@
 ; SI-NEXT:    v_and_b32_e32 v2, s4, v2
 ; SI-NEXT:    v_mul_hi_u32 v12, v2, s5
 ; SI-NEXT:    s_waitcnt vmcnt(3)
-; SI-NEXT:    v_and_b32_e32 v3, s4, v3
-; SI-NEXT:    v_mul_hi_u32 v13, v3, s5
-; SI-NEXT:    s_waitcnt vmcnt(2)
-; SI-NEXT:    v_and_b32_e32 v11, s4, v4
+; SI-NEXT:    v_and_b32_e32 v11, s4, v3
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_and_b32_e32 v5, s4, v5
+; SI-NEXT:    v_mul_hi_u32 v13, v5, s5
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
 ; SI-NEXT:    v_mul_lo_u32 v12, v12, 24
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
 ; SI-NEXT:    v_mul_lo_u32 v13, v13, 24
 ; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
 ; SI-NEXT:    v_lshr_b32_e32 v12, v14, v2
-; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v13
+; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v13
 ; SI-NEXT:    v_sub_i32_e32 v13, vcc, 24, v2
-; SI-NEXT:    v_sub_i32_e32 v14, vcc, 24, v3
+; SI-NEXT:    v_sub_i32_e32 v14, vcc, 24, v5
 ; SI-NEXT:    v_and_b32_e32 v13, s4, v13
-; SI-NEXT:    s_waitcnt vmcnt(1)
-; SI-NEXT:    v_lshl_b32_e32 v5, v5, v13
+; SI-NEXT:    v_lshl_b32_e32 v4, v4, v13
 ; SI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
-; SI-NEXT:    v_lshr_b32_e32 v11, v11, v3
+; SI-NEXT:    v_lshr_b32_e32 v11, v11, v5
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshl_b32_e32 v6, v6, v14
-; SI-NEXT:    v_or_b32_e32 v5, v5, v12
+; SI-NEXT:    v_or_b32_e32 v4, v4, v12
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; SI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; SI-NEXT:    v_or_b32_e32 v6, v6, v11
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; SI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
 ; SI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
@@ -1256,9 +1255,9 @@
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
 ; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
-; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
-; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:20
 ; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
 ; VI-NEXT:    s_mov_b32 s4, 0xffffff
 ; VI-NEXT:    s_mov_b32 s5, 0xaaaaaaab
@@ -1272,32 +1271,31 @@
 ; VI-NEXT:    v_and_b32_e32 v2, s4, v2
 ; VI-NEXT:    v_mul_hi_u32 v12, v2, s5
 ; VI-NEXT:    s_waitcnt vmcnt(3)
-; VI-NEXT:    v_and_b32_e32 v3, s4, v3
-; VI-NEXT:    v_mul_hi_u32 v13, v3, s5
-; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_and_b32_e32 v11, s4, v4
+; VI-NEXT:    v_and_b32_e32 v11, s4, v3
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_and_b32_e32 v5, s4, v5
+; VI-NEXT:    v_mul_hi_u32 v13, v5, s5
 ; VI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
 ; VI-NEXT:    v_mul_lo_u32 v12, v12, 24
 ; VI-NEXT:    v_lshrrev_b32_e32 v13, 4, v13
 ; VI-NEXT:    v_mul_lo_u32 v13, v13, 24
 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v12, v2, v14
-; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v13
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, v5, v13
 ; VI-NEXT:    v_sub_u32_e32 v13, vcc, 24, v2
-; VI-NEXT:    v_sub_u32_e32 v14, vcc, 24, v3
+; VI-NEXT:    v_sub_u32_e32 v14, vcc, 24, v5
 ; VI-NEXT:    v_and_b32_e32 v13, s4, v13
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshlrev_b32_e32 v5, v13, v5
+; VI-NEXT:    v_lshlrev_b32_e32 v4, v13, v4
 ; VI-NEXT:    v_and_b32_e32 v14, 0xffffff, v14
-; VI-NEXT:    v_lshrrev_b32_e32 v11, v3, v11
+; VI-NEXT:    v_lshrrev_b32_e32 v11, v5, v11
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, v14, v6
-; VI-NEXT:    v_or_b32_e32 v5, v5, v12
+; VI-NEXT:    v_or_b32_e32 v4, v4, v12
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; VI-NEXT:    v_or_b32_e32 v6, v6, v11
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; VI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
 ; VI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
 ; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
@@ -1314,10 +1312,10 @@
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:8
 ; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
-; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
-; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
-; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32
-; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:12
 ; GFX9-NEXT:    s_mov_b32 s4, 0xffffff
 ; GFX9-NEXT:    s_mov_b32 s5, 0xaaaaaaab
 ; GFX9-NEXT:    s_waitcnt vmcnt(5)
@@ -1325,31 +1323,29 @@
 ; GFX9-NEXT:    s_waitcnt vmcnt(4)
 ; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v2, s5
-; GFX9-NEXT:    s_waitcnt vmcnt(3)
-; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, s5
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v9, s4, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
+; GFX9-NEXT:    v_mul_hi_u32 v7, v4, s5
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v9, s4, v8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, v2, v10
-; GFX9-NEXT:    v_sub_u32_e32 v3, v3, v7
+; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v7
 ; GFX9-NEXT:    v_sub_u32_e32 v7, 24, v2
-; GFX9-NEXT:    v_sub_u32_e32 v10, 24, v3
+; GFX9-NEXT:    v_sub_u32_e32 v10, 24, v4
 ; GFX9-NEXT:    v_and_b32_e32 v7, s4, v7
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, v3, v9
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, v4, v9
 ; GFX9-NEXT:    v_and_b32_e32 v10, 0xffffff, v10
-; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshl_or_b32 v5, v5, v7, v6
+; GFX9-NEXT:    v_lshl_or_b32 v3, v3, v7, v6
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v6, v8, v10, v9
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v5, v5, v10, v9
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v8, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
 ; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -830,9 +830,9 @@
 ; GFX7-LABEL: notudot2_SameVec:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[0:1], 0x0
@@ -2546,90 +2546,90 @@
 define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
 ; GFX7-LABEL: udot2_acc16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX7-NEXT:    s_lshr_b32 s7, s5, 16
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    s_and_b32 s5, s5, s8
-; GFX7-NEXT:    s_and_b32 s4, s4, s8
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_and_b32 s1, s1, s8
+; GFX7-NEXT:    s_and_b32 s0, s0, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot2_acc16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX8-NEXT:    s_and_b32 s0, s2, s0
+; GFX8-NEXT:    s_and_b32 s3, s2, s1
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    s_and_b32 s1, s0, s1
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-NODL-LABEL: udot2_acc16:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NODL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
+; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s1
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX9-NODL-NEXT:    s_and_b32 s1, s0, s1
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot2_u32_u16 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2681,20 +2681,20 @@
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_mov_b32 s8, s6
 ; GFX7-NEXT:    s_mov_b32 s9, s7
+; GFX7-NEXT:    s_mov_b32 s11, s3
 ; GFX7-NEXT:    s_mov_b32 s6, s2
 ; GFX7-NEXT:    s_mov_b32 s7, s3
-; GFX7-NEXT:    s_mov_b32 s11, s3
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
-; GFX7-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX7-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
 ; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_bfe_i32 v2, v0, 0, 8
+; GFX7-NEXT:    v_bfe_i32 v3, v0, 0, 8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_bfe_i32 v3, v1, 0, 8
-; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v2, v1, 0, 8
 ; GFX7-NEXT:    v_bfe_i32 v1, v1, 8, 8
+; GFX7-NEXT:    v_bfe_i32 v0, v0, 8, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    v_mad_i32_i24 v0, v1, v0, s4
+; GFX7-NEXT:    v_mad_i32_i24 v0, v0, v1, s4
 ; GFX7-NEXT:    v_mad_i32_i24 v0, v3, v2, v0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -177,60 +177,60 @@
 define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: idot4_acc16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_sext_i32_i8 s6, s4
-; GFX7-NEXT:    s_sext_i32_i8 s7, s5
-; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80008
-; GFX7-NEXT:    s_and_b32 s7, s7, s8
-; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x80010
-; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x80008
+; GFX7-NEXT:    s_sext_i32_i8 s2, s0
+; GFX7-NEXT:    s_sext_i32_i8 s3, s1
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x80008
+; GFX7-NEXT:    s_and_b32 s3, s3, s8
+; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x80010
+; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x80008
 ; GFX7-NEXT:    s_and_b32 s10, s10, s8
-; GFX7-NEXT:    s_and_b32 s6, s6, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x80010
-; GFX7-NEXT:    s_ashr_i32 s5, s5, 24
+; GFX7-NEXT:    s_and_b32 s2, s2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x80010
+; GFX7-NEXT:    s_ashr_i32 s1, s1, 24
 ; GFX7-NEXT:    s_and_b32 s12, s12, s8
 ; GFX7-NEXT:    s_and_b32 s9, s9, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
-; GFX7-NEXT:    s_ashr_i32 s4, s4, 24
+; GFX7-NEXT:    s_ashr_i32 s0, s0, 24
 ; GFX7-NEXT:    s_and_b32 s11, s11, s8
-; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_and_b32 s1, s1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s12
-; GFX7-NEXT:    s_and_b32 s4, s4, s8
+; GFX7-NEXT:    s_and_b32 s0, s0, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot4_acc16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_sext_i32_i8 s3, s2
 ; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x80008
+; GFX8-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX8-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX8-NEXT:    s_bfe_i32 s6, s0, 0x80010
@@ -248,20 +248,20 @@
 ;
 ; GFX9-NODL-LABEL: idot4_acc16:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s2
 ; GFX9-NODL-NEXT:    s_bfe_i32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NODL-NEXT:    s_bfe_i32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX9-NODL-NEXT:    s_bfe_i32 s4, s0, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX9-NODL-NEXT:    s_bfe_i32 s6, s0, 0x80010
@@ -279,18 +279,18 @@
 ;
 ; GFX9-DL-LABEL: idot4_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_i32_i8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -352,114 +352,114 @@
 define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: idot4_acc8:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_movk_i32 s5, 0xff
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_movk_i32 s1, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s6, s5
-; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
-; GFX7-NEXT:    s_and_b32 s5, s4, s5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
+; GFX7-NEXT:    s_and_b32 s3, s2, s1
+; GFX7-NEXT:    s_bfe_u32 s8, s2, 0x80008
+; GFX7-NEXT:    s_and_b32 s1, s0, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_bfe_u32 s10, s2, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
-; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot4_acc8:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_movk_i32 s1, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_and_b32 s0, s2, s0
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
+; GFX8-NEXT:    s_and_b32 s3, s2, s1
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX8-NEXT:    s_and_b32 s1, s0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-NODL-LABEL: idot4_acc8:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_movk_i32 s1, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s1
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s1, s0, s1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s2, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot4_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -894,45 +894,46 @@
 define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: idot4_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_ashr_i32 s6, s4, 24
-; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x80010
-; GFX7-NEXT:    s_bfe_i32 s11, s5, 0x80008
-; GFX7-NEXT:    s_ashr_i32 s9, s5, 24
-; GFX7-NEXT:    s_sext_i32_i8 s5, s5
-; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x80010
-; GFX7-NEXT:    s_bfe_i32 s8, s4, 0x80008
-; GFX7-NEXT:    s_sext_i32_i8 s4, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_ashr_i32 s2, s0, 24
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x80010
+; GFX7-NEXT:    s_bfe_i32 s11, s1, 0x80008
+; GFX7-NEXT:    s_ashr_i32 s9, s1, 24
+; GFX7-NEXT:    s_sext_i32_i8 s1, s1
+; GFX7-NEXT:    s_bfe_i32 s3, s0, 0x80010
+; GFX7-NEXT:    s_bfe_i32 s8, s0, 0x80008
+; GFX7-NEXT:    s_sext_i32_i8 s0, s0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_i32_i24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v2, v0
-; GFX7-NEXT:    v_mad_i32_i24 v0, s7, v3, v0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s3, v3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    v_mad_i32_i24 v0, s6, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mad_i32_i24 v0, s2, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot4_acc16_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s0
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -179,114 +179,114 @@
 define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot4_acc16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_movk_i32 s5, 0xff
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_movk_i32 s1, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s6, s5
-; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
-; GFX7-NEXT:    s_and_b32 s5, s4, s5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
+; GFX7-NEXT:    s_and_b32 s3, s2, s1
+; GFX7-NEXT:    s_bfe_u32 s8, s2, 0x80008
+; GFX7-NEXT:    s_and_b32 s1, s0, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_bfe_u32 s10, s2, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
-; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot4_acc16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_movk_i32 s3, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT:    s_and_b32 s1, s0, s3
+; GFX8-NEXT:    s_and_b32 s3, s2, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s5
-; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_and_b32 s0, s2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s4, v4, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-NODL-LABEL: udot4_acc16:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_movk_i32 s3, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s1, s0, s3
+; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s3
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s7
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s4, v4, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_short v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -349,114 +349,114 @@
 define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot4_acc8:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_movk_i32 s5, 0xff
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_movk_i32 s1, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s6, s5
-; GFX7-NEXT:    s_bfe_u32 s8, s6, 0x80008
-; GFX7-NEXT:    s_and_b32 s5, s4, s5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80010
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x80008
+; GFX7-NEXT:    s_and_b32 s3, s2, s1
+; GFX7-NEXT:    s_bfe_u32 s8, s2, 0x80008
+; GFX7-NEXT:    s_and_b32 s1, s0, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_bfe_u32 s10, s2, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
-; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot4_acc8:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_movk_i32 s1, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_and_b32 s0, s2, s0
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
+; GFX8-NEXT:    s_and_b32 s3, s2, s1
+; GFX8-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX8-NEXT:    s_and_b32 s1, s0, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s6, s2, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-NODL-LABEL: udot4_acc8:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_movk_i32 s1, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s1
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s2, 0x80008
+; GFX9-NODL-NEXT:    s_and_b32 s1, s0, s1
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s2, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s0, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s0, 0x80010
 ; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -512,97 +512,100 @@
 define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot2_8:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s7, s4, s8
-; GFX7-NEXT:    s_and_b32 s6, s5, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x80008
+; GFX7-NEXT:    s_and_b32 s3, s0, s8
+; GFX7-NEXT:    s_and_b32 s2, s1, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot2_8:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s3, s2, s0
-; GFX8-NEXT:    s_and_b32 s0, s1, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x80008
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s3, s0, s4
+; GFX8-NEXT:    s_and_b32 s2, s1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-NODL-LABEL: udot2_8:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT:    s_and_b32 s0, s1, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s2, s2, 0x80008
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_and_b32 s3, s0, s4
+; GFX9-NODL-NEXT:    s_and_b32 s2, s1, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NODL-NEXT:    s_bfe_u32 s1, s1, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot2_8:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_and_b32 s3, s2, s0
-; GFX9-DL-NEXT:    s_and_b32 s0, s1, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-DL-NEXT:    s_bfe_u32 s2, s2, 0x80008
+; GFX9-DL-NEXT:    s_and_b32 s3, s0, s4
+; GFX9-DL-NEXT:    s_and_b32 s2, s1, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x80008
+; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -653,114 +656,116 @@
 define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot4_CommutationInsideMAD:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s6, s4, s8
-; GFX7-NEXT:    s_and_b32 s7, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
-; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX7-NEXT:    s_and_b32 s2, s0, s8
+; GFX7-NEXT:    s_and_b32 s3, s1, s8
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x80008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x80010
+; GFX7-NEXT:    s_bfe_u32 s9, s1, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s8
-; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX7-NEXT:    s_bfe_u32 s11, s1, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
-; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot4_CommutationInsideMAD:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT:    s_and_b32 s0, s2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s2, s0, s4
+; GFX8-NEXT:    s_and_b32 s3, s1, s4
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80010
+; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_and_b32 s2, s0, s4
+; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s4
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s0, 0x80010
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s1, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v4, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s3, v3, v2
+; GFX9-DL-NEXT:    v_dot4_u32_u8 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -816,129 +821,132 @@
 define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot4_CommutationAccrossMADs:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b32 s6, s4, s8
-; GFX7-NEXT:    s_and_b32 s7, s5, s8
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s9, s5, 0x80008
+; GFX7-NEXT:    s_and_b32 s2, s0, s8
+; GFX7-NEXT:    s_and_b32 s3, s1, s8
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s9, s1, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s8
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80010
-; GFX7-NEXT:    v_mov_b32_e32 v2, s6
-; GFX7-NEXT:    s_bfe_u32 s11, s5, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x80010
+; GFX7-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-NEXT:    s_bfe_u32 s11, s1, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
-; GFX7-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX7-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v2, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot4_CommutationAccrossMADs:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT:    s_and_b32 s3, s1, s0
-; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s2, s0, s4
+; GFX8-NEXT:    s_and_b32 s3, s1, s4
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT:    s_and_b32 s0, s2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v4, s3
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80010
+; GFX8-NEXT:    v_mov_b32_e32 v4, s2
+; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
+; GFX8-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
-; GFX8-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX8-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
-; GFX8-NEXT:    v_mad_u32_u24 v2, s0, v4, v2
+; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v4, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s0
+; GFX8-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NODL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT:    s_and_b32 s2, s0, s4
+; GFX9-NODL-NEXT:    s_and_b32 s3, s1, s4
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX9-NODL-NEXT:    s_bfe_u32 s6, s0, 0x80010
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s7, s1, 0x80010
+; GFX9-NODL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-NODL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX9-NODL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s0, v4, v2
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s3, v4, v2
 ; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-NODL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-NODL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NODL-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s1, 0x80008
-; GFX9-DL-NEXT:    s_and_b32 s3, s1, s0
-; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x80008
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_and_b32 s2, s0, s4
+; GFX9-DL-NEXT:    s_and_b32 s3, s1, s4
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x80008
+; GFX9-DL-NEXT:    s_bfe_u32 s5, s1, 0x80008
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT:    s_bfe_u32 s6, s1, 0x80010
-; GFX9-DL-NEXT:    s_and_b32 s0, s2, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
-; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-DL-NEXT:    s_lshr_b32 s1, s1, 24
+; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x80010
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX9-DL-NEXT:    s_bfe_u32 s7, s1, 0x80010
+; GFX9-DL-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s6
-; GFX9-DL-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT:    s_lshr_b32 s1, s1, 24
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s0, v4, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s3, v4, v2
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s7, v5, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s0
+; GFX9-DL-NEXT:    v_mad_u32_u24 v2, s1, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1417,54 +1425,54 @@
 define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: notdot4_mixedtypes:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_sext_i32_i8 s7, s6
-; GFX7-NEXT:    s_bfe_u32 s9, s6, 0x80008
-; GFX7-NEXT:    s_sext_i32_i8 s5, s4
-; GFX7-NEXT:    s_and_b32 s7, s7, s8
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x80008
+; GFX7-NEXT:    s_sext_i32_i8 s3, s2
+; GFX7-NEXT:    s_bfe_u32 s9, s2, 0x80008
+; GFX7-NEXT:    s_sext_i32_i8 s1, s0
+; GFX7-NEXT:    s_and_b32 s3, s3, s8
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x80008
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    s_bfe_u32 s11, s6, 0x80010
-; GFX7-NEXT:    s_and_b32 s5, s5, s8
-; GFX7-NEXT:    v_mov_b32_e32 v3, s7
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s6, s6, 24
+; GFX7-NEXT:    s_bfe_u32 s11, s2, 0x80010
+; GFX7-NEXT:    s_and_b32 s1, s1, s8
+; GFX7-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s2, s2, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s11
-; GFX7-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v1, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v3, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: notdot4_mixedtypes:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX8-NEXT:    s_sext_i32_i8 s3, s2
+; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX8-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x80010
@@ -1482,20 +1490,20 @@
 ;
 ; GFX9-NODL-LABEL: notdot4_mixedtypes:
 ; GFX9-NODL:       ; %bb.0: ; %entry
-; GFX9-NODL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NODL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NODL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NODL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NODL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-NODL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NODL-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s3, s2
+; GFX9-NODL-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NODL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-NODL-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX9-NODL-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX9-NODL-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NODL-NEXT:    s_bfe_u32 s6, s0, 0x80010
@@ -1513,20 +1521,20 @@
 ;
 ; GFX9-DL-LABEL: notdot4_mixedtypes:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_bfe_u32 s5, s2, 0x80008
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s3, s2
+; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-DL-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x80008
 ; GFX9-DL-NEXT:    s_sext_i32_i8 s1, s0
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-DL-NEXT:    s_bfe_u32 s6, s0, 0x80010
@@ -1801,64 +1809,65 @@
 define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot4_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_movk_i32 s7, 0xff
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_movk_i32 s3, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s10, s6, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s12, s6, 0x80010
-; GFX7-NEXT:    s_lshr_b32 s9, s6, 24
-; GFX7-NEXT:    s_and_b32 s6, s6, s7
-; GFX7-NEXT:    s_lshr_b32 s5, s4, 24
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x80010
-; GFX7-NEXT:    s_and_b32 s4, s4, s7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s6
+; GFX7-NEXT:    s_bfe_u32 s10, s2, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s12, s2, 0x80010
+; GFX7-NEXT:    s_lshr_b32 s9, s2, 24
+; GFX7-NEXT:    s_and_b32 s2, s2, s3
+; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x80010
+; GFX7-NEXT:    s_and_b32 s0, s0, s3
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s9
-; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s1, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot4_acc16_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b32 s6, s1, s0
-; GFX8-NEXT:    s_and_b32 s0, s2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v5, s0
-; GFX8-NEXT:    s_bfe_u32 s7, s2, 0x80010
-; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 8, s1
-; GFX8-NEXT:    s_lshr_b32 s4, s2, 24
-; GFX8-NEXT:    s_bfe_u32 s5, s1, 0x80010
-; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_and_b32 s6, s0, s4
+; GFX8-NEXT:    s_and_b32 s4, s1, s4
+; GFX8-NEXT:    v_mov_b32_e32 v5, s4
+; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x80010
+; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 8, s0
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 24
+; GFX8-NEXT:    s_bfe_u32 s5, s0, 0x80010
+; GFX8-NEXT:    v_mov_b32_e32 v6, s7
+; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s6, v5, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v3, v2
 ; GFX8-NEXT:    v_mad_u32_u24 v2, s5, v6, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s4
-; GFX8-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mad_u32_u24 v2, s2, v3, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2000,35 +2009,35 @@
 define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
 ; GFX7-LABEL: udot4_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s6, s4, 0x80008
-; GFX7-NEXT:    s_bfe_u32 s10, s5, 0x80008
-; GFX7-NEXT:    s_lshr_b32 s11, s5, 16
-; GFX7-NEXT:    s_lshr_b32 s12, s5, 24
+; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x80008
+; GFX7-NEXT:    s_bfe_u32 s10, s1, 0x80008
+; GFX7-NEXT:    s_lshr_b32 s11, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s12, s1, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s10
-; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX7-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s11
-; GFX7-NEXT:    s_lshr_b32 s9, s4, 24
+; GFX7-NEXT:    s_lshr_b32 s9, s0, 24
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s12
-; GFX7-NEXT:    s_mul_i32 s4, s4, s5
+; GFX7-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s9, v1
-; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s7, v2
-; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s6, v3
-; GFX7-NEXT:    s_and_b32 s5, s4, s8
+; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s3, v2
+; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s2, v3
+; GFX7-NEXT:    s_and_b32 s1, s0, s8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT:    v_or_b32_e32 v2, s5, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, s1, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -2036,25 +2045,26 @@
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot4_acc8_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT:    s_movk_i32 s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v3, s0
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX8-NEXT:    s_lshr_b32 s4, s1, 24
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -254,78 +254,79 @@
 define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x40004
-; GFX7-NEXT:    s_and_b32 s7, s7, s8
-; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s2, s0, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s3, s1, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
+; GFX7-NEXT:    s_and_b32 s3, s3, s8
+; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
 ; GFX7-NEXT:    s_and_b32 s10, s10, s8
-; GFX7-NEXT:    s_and_b32 s6, s6, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_i32 s14, s5, 0x4000c
+; GFX7-NEXT:    s_and_b32 s2, s2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
 ; GFX7-NEXT:    s_and_b32 s12, s12, s8
 ; GFX7-NEXT:    s_and_b32 s9, s9, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
-; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
 ; GFX7-NEXT:    s_and_b32 s14, s14, s8
 ; GFX7-NEXT:    s_and_b32 s11, s11, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s12
-; GFX7-NEXT:    s_bfe_i32 s15, s4, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s15, s0, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
 ; GFX7-NEXT:    s_and_b32 s16, s16, s8
 ; GFX7-NEXT:    s_and_b32 s13, s13, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s14
-; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s17, s4, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
 ; GFX7-NEXT:    s_and_b32 s18, s18, s8
 ; GFX7-NEXT:    s_and_b32 s15, s15, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s16
-; GFX7-NEXT:    s_bfe_i32 s19, s4, 0x40018
-; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40018
+; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
 ; GFX7-NEXT:    s_and_b32 s20, s20, s8
 ; GFX7-NEXT:    s_and_b32 s17, s17, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX7-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX7-NEXT:    s_and_b32 s19, s19, s8
-; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_and_b32 s1, s1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s20
-; GFX7-NEXT:    s_and_b32 s4, s4, s8
+; GFX7-NEXT:    s_and_b32 s0, s0, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v6, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x40000
 ; GFX8-NEXT:    s_bfe_i32 s5, s1, 0x40000
@@ -369,14 +370,15 @@
 ;
 ; GFX9-LABEL: idot8_acc16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_bfe_i32 s4, s0, 0x40000
 ; GFX9-NEXT:    s_bfe_i32 s5, s1, 0x40000
@@ -420,14 +422,15 @@
 ;
 ; GFX9-DL-LABEL: idot8_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
 ; GFX9-DL-NEXT:    s_bfe_i32 s5, s1, 0x40000
@@ -589,89 +592,89 @@
 define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc8:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s7, s5, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s10, s5, 0x40004
-; GFX7-NEXT:    s_and_b32 s7, s7, s8
-; GFX7-NEXT:    s_bfe_i32 s9, s4, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s12, s5, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s2, s0, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s3, s1, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s10, s1, 0x40004
+; GFX7-NEXT:    s_and_b32 s3, s3, s8
+; GFX7-NEXT:    s_bfe_i32 s9, s0, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s12, s1, 0x40008
 ; GFX7-NEXT:    s_and_b32 s10, s10, s8
-; GFX7-NEXT:    s_and_b32 s6, s6, s8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s7
-; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_i32 s14, s5, 0x4000c
+; GFX7-NEXT:    s_and_b32 s2, s2, s8
+; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s14, s1, 0x4000c
 ; GFX7-NEXT:    s_and_b32 s12, s12, s8
 ; GFX7-NEXT:    s_and_b32 s9, s9, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s10
-; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40010
 ; GFX7-NEXT:    s_and_b32 s14, s14, s8
 ; GFX7-NEXT:    s_and_b32 s11, s11, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s12
-; GFX7-NEXT:    s_bfe_i32 s15, s4, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s15, s0, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x40014
 ; GFX7-NEXT:    s_and_b32 s16, s16, s8
 ; GFX7-NEXT:    s_and_b32 s13, s13, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s14
-; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40018
-; GFX7-NEXT:    s_bfe_i32 s17, s4, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s17, s0, 0x40014
 ; GFX7-NEXT:    s_and_b32 s18, s18, s8
 ; GFX7-NEXT:    s_and_b32 s15, s15, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s16
-; GFX7-NEXT:    s_bfe_i32 s19, s4, 0x40018
-; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    s_bfe_i32 s19, s0, 0x40018
+; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
 ; GFX7-NEXT:    s_and_b32 s20, s20, s8
 ; GFX7-NEXT:    s_and_b32 s17, s17, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s18
-; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
+; GFX7-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX7-NEXT:    s_and_b32 s19, s19, s8
-; GFX7-NEXT:    s_and_b32 s5, s5, s8
+; GFX7-NEXT:    s_and_b32 s1, s1, s8
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s20
-; GFX7-NEXT:    s_and_b32 s4, s4, s8
+; GFX7-NEXT:    s_and_b32 s0, s0, s8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s13, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s15, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s17, v6, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s19, v7, v0
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc8:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_movk_i32 s6, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_i32 s6, s3, 0x40000
-; GFX8-NEXT:    s_lshr_b32 s4, s3, 12
-; GFX8-NEXT:    s_bfe_i32 s8, s3, 0x40004
-; GFX8-NEXT:    s_bfe_i32 s10, s3, 0x40008
+; GFX8-NEXT:    s_bfe_i32 s5, s2, 0x40000
 ; GFX8-NEXT:    s_lshr_b32 s1, s0, 12
-; GFX8-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    s_lshr_b32 s3, s2, 12
+; GFX8-NEXT:    s_bfe_i32 s8, s2, 0x40004
+; GFX8-NEXT:    s_bfe_i32 s10, s2, 0x40008
+; GFX8-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX8-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX8-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
+; GFX8-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
 ; GFX8-NEXT:    s_bfe_i32 s7, s0, 0x40004
 ; GFX8-NEXT:    s_bfe_i32 s9, s0, 0x40008
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s10
@@ -679,53 +682,53 @@
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX8-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX8-NEXT:    s_bfe_i32 s12, s3, 0x40010
-; GFX8-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX8-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX8-NEXT:    s_bfe_i32 s14, s3, 0x40014
+; GFX8-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX8-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX8-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX8-NEXT:    s_bfe_i32 s14, s2, 0x40014
 ; GFX8-NEXT:    s_bfe_i32 s11, s0, 0x40010
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s12
-; GFX8-NEXT:    s_bfe_i32 s16, s3, 0x40018
+; GFX8-NEXT:    s_bfe_i32 s16, s2, 0x40018
 ; GFX8-NEXT:    s_bfe_i32 s13, s0, 0x40014
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s14
 ; GFX8-NEXT:    s_bfe_i32 s15, s0, 0x40018
-; GFX8-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX8-NEXT:    s_ashr_i32 s2, s2, 28
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
+; GFX8-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
 ; GFX8-NEXT:    v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX8-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX8-NEXT:    v_mov_b32_e32 v3, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: idot8_acc8:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_movk_i32 s6, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_i32 s6, s3, 0x40000
-; GFX9-NEXT:    s_lshr_b32 s4, s3, 12
-; GFX9-NEXT:    s_bfe_i32 s8, s3, 0x40004
-; GFX9-NEXT:    s_bfe_i32 s10, s3, 0x40008
+; GFX9-NEXT:    s_bfe_i32 s5, s2, 0x40000
 ; GFX9-NEXT:    s_lshr_b32 s1, s0, 12
-; GFX9-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX9-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-NEXT:    s_lshr_b32 s3, s2, 12
+; GFX9-NEXT:    s_bfe_i32 s8, s2, 0x40004
+; GFX9-NEXT:    s_bfe_i32 s10, s2, 0x40008
+; GFX9-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX9-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
+; GFX9-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
 ; GFX9-NEXT:    s_bfe_i32 s7, s0, 0x40004
 ; GFX9-NEXT:    s_bfe_i32 s9, s0, 0x40008
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
@@ -733,53 +736,53 @@
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-NEXT:    s_bfe_i32 s12, s3, 0x40010
-; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX9-NEXT:    s_bfe_i32 s14, s3, 0x40014
+; GFX9-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX9-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX9-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX9-NEXT:    s_bfe_i32 s14, s2, 0x40014
 ; GFX9-NEXT:    s_bfe_i32 s11, s0, 0x40010
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-NEXT:    s_bfe_i32 s16, s3, 0x40018
+; GFX9-NEXT:    s_bfe_i32 s16, s2, 0x40018
 ; GFX9-NEXT:    s_bfe_i32 s13, s0, 0x40014
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s14
 ; GFX9-NEXT:    s_bfe_i32 s15, s0, 0x40018
-; GFX9-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-NEXT:    s_ashr_i32 s2, s2, 28
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX9-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
+; GFX9-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
 ; GFX9-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX9-DL-LABEL: idot8_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_movk_i32 s2, 0xff
+; GFX9-DL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_movk_i32 s6, 0xff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
+; GFX9-DL-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_i32 s6, s3, 0x40000
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s3, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s8, s3, 0x40004
-; GFX9-DL-NEXT:    s_bfe_i32 s10, s3, 0x40008
+; GFX9-DL-NEXT:    s_bfe_i32 s5, s2, 0x40000
 ; GFX9-DL-NEXT:    s_lshr_b32 s1, s0, 12
-; GFX9-DL-NEXT:    s_bfe_i32 s5, s0, 0x40000
-; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s2, 12
+; GFX9-DL-NEXT:    s_bfe_i32 s8, s2, 0x40004
+; GFX9-DL-NEXT:    s_bfe_i32 s10, s2, 0x40008
+; GFX9-DL-NEXT:    s_bfe_i32 s4, s0, 0x40000
+; GFX9-DL-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s4
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v5, 12, s3
 ; GFX9-DL-NEXT:    s_bfe_i32 s7, s0, 0x40004
 ; GFX9-DL-NEXT:    s_bfe_i32 s9, s0, 0x40008
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
@@ -787,28 +790,28 @@
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v4, 12, v4
 ; GFX9-DL-NEXT:    v_ashrrev_i16_e32 v5, 12, v5
 ; GFX9-DL-NEXT:    v_mul_i32_i24_e32 v3, s9, v3
-; GFX9-DL-NEXT:    s_bfe_i32 s12, s3, 0x40010
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, s2, v5
-; GFX9-DL-NEXT:    s_bfe_i32 s14, s3, 0x40014
+; GFX9-DL-NEXT:    s_bfe_i32 s12, s2, 0x40010
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, s6, v4
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, s6, v5
+; GFX9-DL-NEXT:    s_bfe_i32 s14, s2, 0x40014
 ; GFX9-DL-NEXT:    s_bfe_i32 s11, s0, 0x40010
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v8, s12
-; GFX9-DL-NEXT:    s_bfe_i32 s16, s3, 0x40018
+; GFX9-DL-NEXT:    s_bfe_i32 s16, s2, 0x40018
 ; GFX9-DL-NEXT:    s_bfe_i32 s13, s0, 0x40014
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s14
 ; GFX9-DL-NEXT:    s_bfe_i32 s15, s0, 0x40018
-; GFX9-DL-NEXT:    s_ashr_i32 s3, s3, 28
+; GFX9-DL-NEXT:    s_ashr_i32 s2, s2, 28
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s16
 ; GFX9-DL-NEXT:    s_ashr_i32 s0, s0, 28
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s5, v6, v2
+; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s4, v6, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s7, v7, v2
 ; GFX9-DL-NEXT:    v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v4, v5, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s11, v8, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s13, v9, v2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s15, v10, v2
-; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-DL-NEXT:    v_mad_i32_i24 v2, s0, v3, v2
 ; GFX9-DL-NEXT:    global_store_byte v[0:1], v2, off
 ; GFX9-DL-NEXT:    s_endpgm
@@ -1580,14 +1583,14 @@
 ; GFX7-LABEL: idot8_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s2, -1
 ; GFX7-NEXT:    s_mov_b32 s8, 0xffff
+; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x0
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bfe_i32 s15, s6, 0x40018
 ; GFX7-NEXT:    s_bfe_i32 s16, s6, 0x40014
@@ -1639,20 +1642,20 @@
 ;
 ; GFX8-LABEL: idot8_acc16_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s4
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[2:3], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshl_b32 s27, s3, 28
 ; GFX8-NEXT:    s_ashr_i64 s[16:17], s[2:3], 60
+; GFX8-NEXT:    s_lshl_b32 s15, s1, 28
 ; GFX8-NEXT:    s_lshl_b32 s19, s3, 8
 ; GFX8-NEXT:    s_lshl_b32 s21, s3, 12
-; GFX8-NEXT:    s_lshl_b32 s15, s1, 28
 ; GFX8-NEXT:    s_lshl_b32 s23, s3, 16
 ; GFX8-NEXT:    s_lshl_b32 s25, s3, 24
 ; GFX8-NEXT:    s_lshl_b32 s17, s3, 4
@@ -1938,49 +1941,49 @@
 define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: idot8_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_movk_i32 s8, 0xff
-; GFX7-NEXT:    s_mov_b32 s9, 0xffff
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_mov_b32 s9, 0xffff
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_i32 s6, s4, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s15, s5, 0x40000
-; GFX7-NEXT:    s_bfe_i32 s16, s5, 0x40004
-; GFX7-NEXT:    s_bfe_i32 s17, s5, 0x40008
-; GFX7-NEXT:    s_bfe_i32 s18, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_i32 s19, s5, 0x40010
-; GFX7-NEXT:    s_bfe_i32 s20, s5, 0x40014
-; GFX7-NEXT:    s_bfe_i32 s21, s5, 0x40018
-; GFX7-NEXT:    s_ashr_i32 s5, s5, 28
+; GFX7-NEXT:    s_bfe_i32 s2, s0, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s15, s1, 0x40000
+; GFX7-NEXT:    s_bfe_i32 s16, s1, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s17, s1, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s18, s1, 0x4000c
+; GFX7-NEXT:    s_bfe_i32 s19, s1, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s20, s1, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s21, s1, 0x40018
+; GFX7-NEXT:    s_ashr_i32 s1, s1, 28
 ; GFX7-NEXT:    v_mov_b32_e32 v8, s15
-; GFX7-NEXT:    s_bfe_i32 s7, s4, 0x40004
+; GFX7-NEXT:    s_bfe_i32 s3, s0, 0x40004
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s16
-; GFX7-NEXT:    s_bfe_i32 s10, s4, 0x40008
+; GFX7-NEXT:    s_bfe_i32 s10, s0, 0x40008
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s17
-; GFX7-NEXT:    s_bfe_i32 s11, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_i32 s11, s0, 0x4000c
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s18
-; GFX7-NEXT:    s_bfe_i32 s12, s4, 0x40010
+; GFX7-NEXT:    s_bfe_i32 s12, s0, 0x40010
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s19
-; GFX7-NEXT:    s_bfe_i32 s13, s4, 0x40014
+; GFX7-NEXT:    s_bfe_i32 s13, s0, 0x40014
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s20
-; GFX7-NEXT:    s_bfe_i32 s14, s4, 0x40018
+; GFX7-NEXT:    s_bfe_i32 s14, s0, 0x40018
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s21
-; GFX7-NEXT:    s_ashr_i32 s4, s4, 28
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s4, v1
+; GFX7-NEXT:    s_ashr_i32 s0, s0, 28
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mul_i32_i24_e32 v1, s0, v1
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v2, s14, v2
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v3, s13, v3
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v9, s12, v4
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v5, s11, v5
 ; GFX7-NEXT:    v_mul_i32_i24_e32 v6, s10, v6
-; GFX7-NEXT:    v_mul_i32_i24_e32 v7, s7, v7
-; GFX7-NEXT:    v_mul_i32_i24_e32 v8, s6, v8
+; GFX7-NEXT:    v_mul_i32_i24_e32 v7, s3, v7
+; GFX7-NEXT:    v_mul_i32_i24_e32 v8, s2, v8
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, s8, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
@@ -2014,20 +2017,21 @@
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: idot8_acc8_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_mov_b32 s33, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s3, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s1, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s3, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshl_b32 s11, s1, 24
 ; GFX8-NEXT:    s_lshl_b32 s15, s1, 16
@@ -2103,15 +2107,16 @@
 ;
 ; GFX9-LABEL: idot8_acc8_vecMul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s7, s0, 4
 ; GFX9-NEXT:    s_lshr_b32 s14, s1, 4
@@ -2138,21 +2143,21 @@
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 20
-; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 20
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s1, 20
 ; GFX9-NEXT:    s_lshr_b32 s11, s1, 16
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
+; GFX9-NEXT:    v_lshlrev_b16_e64 v10, 12, s3
+; GFX9-NEXT:    v_lshlrev_b16_e64 v11, 12, s2
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v17, 12, s11
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v18, 12, s10
 ; GFX9-NEXT:    s_lshr_b32 s5, s0, 28
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-NEXT:    s_lshr_b32 s12, s1, 28
 ; GFX9-NEXT:    s_lshr_b32 s13, s1, 24
-; GFX9-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX9-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
 ; GFX9-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
@@ -2174,7 +2179,7 @@
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
 ; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX9-NEXT:    v_or_b32_e32 v6, v4, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
@@ -2191,15 +2196,16 @@
 ;
 ; GFX9-DL-LABEL: idot8_acc8_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s7, s0, 4
 ; GFX9-DL-NEXT:    s_lshr_b32 s14, s1, 4
@@ -2226,21 +2232,21 @@
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, v3, v4
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    s_lshr_b32 s3, s0, 20
-; GFX9-DL-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 20
+; GFX9-DL-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX9-DL-NEXT:    s_lshr_b32 s10, s1, 20
 ; GFX9-DL-NEXT:    s_lshr_b32 s11, s1, 16
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s3
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v10, 12, s3
+; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v11, 12, s2
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v17, 12, s11
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v18, 12, s10
 ; GFX9-DL-NEXT:    s_lshr_b32 s5, s0, 28
 ; GFX9-DL-NEXT:    s_lshr_b32 s6, s0, 24
 ; GFX9-DL-NEXT:    s_lshr_b32 s12, s1, 28
 ; GFX9-DL-NEXT:    s_lshr_b32 s13, s1, 24
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v8, 12, s6
 ; GFX9-DL-NEXT:    v_lshlrev_b16_e64 v9, 12, s5
@@ -2262,7 +2268,7 @@
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, s2, v4
+; GFX9-DL-NEXT:    v_and_b32_e32 v4, s4, v4
 ; GFX9-DL-NEXT:    v_or_b32_e32 v6, v4, v8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -254,32 +254,32 @@
 define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc16:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
-; GFX7-NEXT:    s_and_b32 s5, s5, 15
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
-; GFX7-NEXT:    s_and_b32 s4, s4, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40004
+; GFX7-NEXT:    s_and_b32 s0, s0, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s17
@@ -287,28 +287,29 @@
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v7, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc16:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
 ; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
@@ -349,14 +350,15 @@
 ;
 ; GFX9-LABEL: udot8_acc16:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
 ; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
@@ -397,14 +399,15 @@
 ;
 ; GFX9-DL-LABEL: udot8_acc16:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
@@ -557,32 +560,32 @@
 define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc8:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
-; GFX7-NEXT:    s_and_b32 s5, s5, 15
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
-; GFX7-NEXT:    s_and_b32 s4, s4, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40004
+; GFX7-NEXT:    s_and_b32 s0, s0, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s17
@@ -590,28 +593,29 @@
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v7, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc8:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
 ; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
@@ -652,14 +656,15 @@
 ;
 ; GFX9-LABEL: udot8_acc8:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshr_b32 s2, s0, 28
 ; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40018
@@ -700,14 +705,15 @@
 ;
 ; GFX9-DL-LABEL: udot8_acc8:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_lshr_b32 s2, s0, 28
 ; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40018
@@ -860,32 +866,32 @@
 define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc4:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
-; GFX7-NEXT:    s_and_b32 s5, s5, 15
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
-; GFX7-NEXT:    s_and_b32 s4, s4, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40004
+; GFX7-NEXT:    s_and_b32 s0, s0, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s17
@@ -893,29 +899,30 @@
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v7, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc4:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s8, s0, 15
 ; GFX8-NEXT:    s_and_b32 s15, s1, 15
@@ -959,14 +966,15 @@
 ;
 ; GFX9-LABEL: udot8_acc4:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s8, s0, 15
 ; GFX9-NEXT:    s_and_b32 s15, s1, 15
@@ -1010,14 +1018,15 @@
 ;
 ; GFX9-DL-LABEL: udot8_acc4:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
 ; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
@@ -1160,32 +1169,32 @@
 define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_CommutationInsideMAD:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
-; GFX7-NEXT:    s_and_b32 s5, s5, 15
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
-; GFX7-NEXT:    s_and_b32 s4, s4, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40004
+; GFX7-NEXT:    s_and_b32 s0, s0, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s17
@@ -1193,29 +1202,30 @@
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v7, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_CommutationInsideMAD:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s8, s0, 15
 ; GFX8-NEXT:    s_and_b32 s15, s1, 15
@@ -1259,14 +1269,15 @@
 ;
 ; GFX9-LABEL: udot8_CommutationInsideMAD:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s8, s0, 15
 ; GFX9-NEXT:    s_and_b32 s15, s1, 15
@@ -1310,14 +1321,15 @@
 ;
 ; GFX9-DL-LABEL: udot8_CommutationInsideMAD:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
 ; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
@@ -1976,38 +1988,38 @@
 define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc16_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x40004
-; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x40004
-; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x4000c
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s17
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT:    s_and_b32 s18, s5, 15
-; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
-; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40010
+; GFX7-NEXT:    s_and_b32 s18, s1, 15
+; GFX7-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x4000c
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s12, v2
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s10, v4
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT:    s_and_b32 s11, s4, 15
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX7-NEXT:    s_and_b32 s11, s0, 15
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s18
-; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x40008
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s4, v1
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x40008
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v1, s0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s11, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
@@ -2025,22 +2037,23 @@
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v7, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
-; GFX7-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
+; GFX7-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc16_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 28
 ; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40018
@@ -2278,53 +2291,53 @@
 define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc8_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_bfe_u32 s6, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s13, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s17, s5, 28
+; GFX7-NEXT:    s_bfe_u32 s2, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s13, s1, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s17, s1, 28
 ; GFX7-NEXT:    v_mov_b32_e32 v8, s13
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40008
-; GFX7-NEXT:    s_and_b32 s16, s5, 15
-; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40004
+; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x40008
+; GFX7-NEXT:    s_and_b32 s16, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40004
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s15
-; GFX7-NEXT:    s_lshr_b32 s10, s4, 28
+; GFX7-NEXT:    s_lshr_b32 s10, s0, 28
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s17
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v4, s10, v4
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v6, s8, v6
-; GFX7-NEXT:    v_mul_u32_u24_e32 v8, s6, v8
-; GFX7-NEXT:    s_bfe_u32 s5, s5, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40008
+; GFX7-NEXT:    v_mul_u32_u24_e32 v8, s2, v8
+; GFX7-NEXT:    s_bfe_u32 s1, s1, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x40008
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s14
-; GFX7-NEXT:    s_and_b32 s9, s4, 15
+; GFX7-NEXT:    s_and_b32 s9, s0, 15
 ; GFX7-NEXT:    v_mov_b32_e32 v5, s16
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40018
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s18
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40014
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v2, s12, v2
-; GFX7-NEXT:    s_bfe_u32 s4, s4, 0x40010
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_bfe_u32 s0, s0, 0x40010
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v3, s11, v3
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
 ; GFX7-NEXT:    v_mul_u32_u24_e32 v5, s9, v5
-; GFX7-NEXT:    v_mul_u32_u24_e32 v7, s7, v7
+; GFX7-NEXT:    v_mul_u32_u24_e32 v7, s3, v7
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 8, v8
 ; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX7-NEXT:    v_or_b32_e32 v4, v5, v6
 ; GFX7-NEXT:    v_or_b32_e32 v5, v7, v8
-; GFX7-NEXT:    v_mul_u32_u24_e32 v9, s4, v1
+; GFX7-NEXT:    v_mul_u32_u24_e32 v9, s0, v1
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX7-NEXT:    v_or_b32_e32 v2, v9, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -2342,38 +2355,39 @@
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc8_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_bfe_u32 s7, s1, 0x40004
-; GFX8-NEXT:    s_bfe_u32 s9, s1, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s14, s2, 0x40004
-; GFX8-NEXT:    s_and_b32 s15, s2, 15
-; GFX8-NEXT:    s_bfe_u32 s16, s2, 0x4000c
-; GFX8-NEXT:    s_bfe_u32 s3, s1, 0x40014
-; GFX8-NEXT:    s_lshr_b32 s5, s1, 28
-; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x40014
-; GFX8-NEXT:    s_bfe_u32 s11, s2, 0x40010
-; GFX8-NEXT:    s_lshr_b32 s12, s2, 28
-; GFX8-NEXT:    s_bfe_u32 s13, s2, 0x40018
-; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x40008
-; GFX8-NEXT:    s_and_b32 s8, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s7, s0, 0x40004
+; GFX8-NEXT:    s_bfe_u32 s9, s0, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s14, s1, 0x40004
+; GFX8-NEXT:    s_and_b32 s15, s1, 15
+; GFX8-NEXT:    s_bfe_u32 s16, s1, 0x4000c
+; GFX8-NEXT:    s_bfe_u32 s2, s0, 0x40014
+; GFX8-NEXT:    s_lshr_b32 s5, s0, 28
+; GFX8-NEXT:    s_bfe_u32 s10, s1, 0x40014
+; GFX8-NEXT:    s_bfe_u32 s11, s1, 0x40010
+; GFX8-NEXT:    s_lshr_b32 s12, s1, 28
+; GFX8-NEXT:    s_bfe_u32 s13, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x40008
+; GFX8-NEXT:    s_and_b32 s8, s0, 15
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s15
@@ -2382,27 +2396,27 @@
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v5, s8, v6
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    s_bfe_u32 s4, s1, 0x40010
-; GFX8-NEXT:    s_bfe_u32 s6, s1, 0x40018
+; GFX8-NEXT:    s_bfe_u32 s3, s0, 0x40010
+; GFX8-NEXT:    s_bfe_u32 s6, s0, 0x40018
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s13
-; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x40008
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
+; GFX8-NEXT:    s_bfe_u32 s0, s0, 0x40008
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s12
 ; GFX8-NEXT:    v_mov_b32_e32 v11, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v12, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v13, s10
-; GFX8-NEXT:    v_mov_b32_e32 v14, s3
-; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s1, v3
+; GFX8-NEXT:    v_mov_b32_e32 v14, s2
+; GFX8-NEXT:    v_mul_u32_u24_e32 v3, s0, v3
 ; GFX8-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_mul_u32_u24_e32 v7, s6, v9
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_u32_u24_e32 v9, s4, v12
+; GFX8-NEXT:    v_mul_u32_u24_e32 v9, s3, v12
 ; GFX8-NEXT:    v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX8-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v4, s0, v9
+; GFX8-NEXT:    v_and_b32_e32 v4, s4, v9
 ; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX8-NEXT:    v_or_b32_e32 v6, v4, v7
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 8, v3
@@ -2421,17 +2435,18 @@
 ;
 ; GFX9-LABEL: udot8_acc8_vecMul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40010
+; GFX9-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX9-NEXT:    s_bfe_u32 s10, s1, 0x40010
 ; GFX9-NEXT:    s_bfe_u32 s11, s1, 0x40014
 ; GFX9-NEXT:    s_bfe_u32 s12, s1, 0x40018
@@ -2441,7 +2456,7 @@
 ; GFX9-NEXT:    s_bfe_u32 s16, s1, 0x40008
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-NEXT:    s_bfe_u32 s3, s0, 0x40014
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX9-NEXT:    s_bfe_u32 s5, s0, 0x40018
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s12
@@ -2455,8 +2470,8 @@
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s16
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s1
-; GFX9-NEXT:    v_mul_lo_u16_e32 v3, s3, v3
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v3, s2, v3
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, s3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v5, s5, v5
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v7, s7, v7
@@ -2466,11 +2481,11 @@
 ; GFX9-NEXT:    v_or_b32_e32 v5, v7, v8
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v9, s9, v9
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_and_b32_e32 v5, s2, v5
+; GFX9-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX9-NEXT:    v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_e32 v6, v5, v6
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX9-NEXT:    v_or_b32_e32 v4, v3, v4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v2, v5, v2
@@ -2487,17 +2502,18 @@
 ;
 ; GFX9-DL-LABEL: udot8_acc8_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT:    s_mov_b32 s2, 0xffff
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40010
+; GFX9-DL-NEXT:    s_bfe_u32 s2, s0, 0x40010
 ; GFX9-DL-NEXT:    s_bfe_u32 s10, s1, 0x40010
 ; GFX9-DL-NEXT:    s_bfe_u32 s11, s1, 0x40014
 ; GFX9-DL-NEXT:    s_bfe_u32 s12, s1, 0x40018
@@ -2507,7 +2523,7 @@
 ; GFX9-DL-NEXT:    s_bfe_u32 s16, s1, 0x40008
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-DL-NEXT:    s_bfe_u32 s1, s1, 0x4000c
-; GFX9-DL-NEXT:    s_bfe_u32 s4, s0, 0x40014
+; GFX9-DL-NEXT:    s_bfe_u32 s3, s0, 0x40014
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX9-DL-NEXT:    s_bfe_u32 s5, s0, 0x40018
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v5, s12
@@ -2521,8 +2537,8 @@
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v9, s16
 ; GFX9-DL-NEXT:    s_bfe_u32 s0, s0, 0x4000c
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v10, s1
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s3, v3
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v3, s2, v3
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v4, s3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, s5, v5
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, s7, v7
@@ -2532,11 +2548,11 @@
 ; GFX9-DL-NEXT:    v_or_b32_e32 v5, v7, v8
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v9, s9, v9
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, s2, v5
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, s4, v5
 ; GFX9-DL-NEXT:    v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_or_b32_e32 v6, v5, v6
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v6
-; GFX9-DL-NEXT:    v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT:    v_and_b32_e32 v3, s4, v3
 ; GFX9-DL-NEXT:    v_or_b32_e32 v4, v3, v4
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_add_u32_e32 v2, v5, v2
@@ -2651,32 +2667,32 @@
 define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
 ; GFX7-LABEL: udot8_acc4_vecMul:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
-; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, -1
+; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX7-NEXT:    s_mov_b32 s7, 0xf000
+; GFX7-NEXT:    s_mov_b32 s6, -1
+; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
-; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
+; GFX7-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX7-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_lshr_b32 s6, s4, 28
-; GFX7-NEXT:    s_bfe_u32 s14, s5, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s15, s5, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s17, s5, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s18, s5, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s19, s5, 0x40004
-; GFX7-NEXT:    s_lshr_b32 s13, s5, 28
-; GFX7-NEXT:    s_and_b32 s5, s5, 15
-; GFX7-NEXT:    s_bfe_u32 s7, s4, 0x40018
-; GFX7-NEXT:    s_bfe_u32 s8, s4, 0x40014
-; GFX7-NEXT:    s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT:    s_bfe_u32 s10, s4, 0x4000c
-; GFX7-NEXT:    s_bfe_u32 s11, s4, 0x40008
-; GFX7-NEXT:    s_bfe_u32 s12, s4, 0x40004
-; GFX7-NEXT:    s_and_b32 s4, s4, 15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 28
+; GFX7-NEXT:    s_bfe_u32 s14, s1, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s15, s1, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s16, s1, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s17, s1, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s18, s1, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s19, s1, 0x40004
+; GFX7-NEXT:    s_lshr_b32 s13, s1, 28
+; GFX7-NEXT:    s_and_b32 s1, s1, 15
+; GFX7-NEXT:    s_bfe_u32 s3, s0, 0x40018
+; GFX7-NEXT:    s_bfe_u32 s8, s0, 0x40014
+; GFX7-NEXT:    s_bfe_u32 s9, s0, 0x40010
+; GFX7-NEXT:    s_bfe_u32 s10, s0, 0x4000c
+; GFX7-NEXT:    s_bfe_u32 s11, s0, 0x40008
+; GFX7-NEXT:    s_bfe_u32 s12, s0, 0x40004
+; GFX7-NEXT:    s_and_b32 s0, s0, 15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s19
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s17
@@ -2684,29 +2700,30 @@
 ; GFX7-NEXT:    v_mov_b32_e32 v6, s15
 ; GFX7-NEXT:    v_mov_b32_e32 v7, s14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s0, v1, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s12, v2, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s11, v3, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s10, v4, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v5, v0
 ; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v6, v0
-; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v7, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s3, v7, v0
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s13
-; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
+; GFX7-NEXT:    v_mad_u32_u24 v0, s2, v1, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX7-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX7-NEXT:    buffer_store_byte v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: udot8_acc4_vecMul:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v0, s0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX8-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s8, s0, 15
 ; GFX8-NEXT:    s_and_b32 s15, s1, 15
@@ -2750,14 +2767,15 @@
 ;
 ; GFX9-LABEL: udot8_acc4_vecMul:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s8, s0, 15
 ; GFX9-NEXT:    s_and_b32 s15, s1, 15
@@ -2801,14 +2819,15 @@
 ;
 ; GFX9-DL-LABEL: udot8_acc4_vecMul:
 ; GFX9-DL:       ; %bb.0: ; %entry
-; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-DL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-DL-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-DL-NEXT:    global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-DL-NEXT:    s_load_dword s1, s[2:3], 0x0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    s_and_b32 s8, s0, 15
 ; GFX9-DL-NEXT:    s_and_b32 s15, s1, 15
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -205,7 +205,7 @@
 ; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
 ; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
 ; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x1010101
-; GCN:     s_and_b32 s3, s1, [[K]]
+; GCN:     s_and_b32 s{{[0-9]+}}, s1, [[K]]
 ; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
 ; GCN:     s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
 ; GCN:     s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1617,36 +1617,36 @@
 define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
 ; SI-LABEL: dynamic_insertelement_v8f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; SI-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x10
-; SI-NEXT:    s_load_dword s4, s[4:5], 0x20
+; SI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x10
+; SI-NEXT:    s_load_dword s6, s[4:5], 0x20
 ; SI-NEXT:    s_add_u32 s0, s0, s7
 ; SI-NEXT:    s_addc_u32 s1, s1, 0
 ; SI-NEXT:    v_mov_b32_e32 v16, 64
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s12
-; SI-NEXT:    s_and_b32 s4, s4, 7
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    v_mov_b32_e32 v1, s13
-; SI-NEXT:    v_mov_b32_e32 v12, s24
-; SI-NEXT:    v_mov_b32_e32 v13, s25
-; SI-NEXT:    v_mov_b32_e32 v14, s26
-; SI-NEXT:    v_mov_b32_e32 v15, s27
-; SI-NEXT:    v_mov_b32_e32 v2, s14
-; SI-NEXT:    v_mov_b32_e32 v3, s15
-; SI-NEXT:    v_mov_b32_e32 v4, s16
-; SI-NEXT:    v_mov_b32_e32 v5, s17
-; SI-NEXT:    v_mov_b32_e32 v6, s18
-; SI-NEXT:    v_mov_b32_e32 v7, s19
-; SI-NEXT:    v_mov_b32_e32 v8, s20
-; SI-NEXT:    v_mov_b32_e32 v9, s21
-; SI-NEXT:    v_mov_b32_e32 v10, s22
-; SI-NEXT:    v_mov_b32_e32 v11, s23
+; SI-NEXT:    v_mov_b32_e32 v0, s8
+; SI-NEXT:    s_and_b32 s6, s6, 7
+; SI-NEXT:    s_lshl_b32 s6, s6, 3
+; SI-NEXT:    v_mov_b32_e32 v1, s9
+; SI-NEXT:    v_mov_b32_e32 v12, s20
+; SI-NEXT:    v_mov_b32_e32 v13, s21
+; SI-NEXT:    v_mov_b32_e32 v14, s22
+; SI-NEXT:    v_mov_b32_e32 v15, s23
+; SI-NEXT:    v_or_b32_e32 v16, s6, v16
+; SI-NEXT:    v_mov_b32_e32 v2, s10
+; SI-NEXT:    v_mov_b32_e32 v3, s11
+; SI-NEXT:    v_mov_b32_e32 v4, s12
+; SI-NEXT:    v_mov_b32_e32 v5, s13
+; SI-NEXT:    v_mov_b32_e32 v6, s14
+; SI-NEXT:    v_mov_b32_e32 v7, s15
+; SI-NEXT:    v_mov_b32_e32 v8, s16
+; SI-NEXT:    v_mov_b32_e32 v9, s17
+; SI-NEXT:    v_mov_b32_e32 v10, s18
+; SI-NEXT:    v_mov_b32_e32 v11, s19
 ; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
 ; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; SI-NEXT:    v_or_b32_e32 v16, s4, v16
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x40200000
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
@@ -1654,47 +1654,47 @@
 ; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
 ; SI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
 ; SI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
-; SI-NEXT:    s_mov_b32 s11, 0x100f000
-; SI-NEXT:    s_mov_b32 s10, -1
-; SI-NEXT:    s_waitcnt vmcnt(0)
-; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; SI-NEXT:    s_mov_b32 s7, 0x100f000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v8f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x40
-; VI-NEXT:    s_load_dword s4, s[4:5], 0x80
+; VI-NEXT:    s_load_dword s6, s[4:5], 0x80
+; VI-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; VI-NEXT:    s_add_u32 s0, s0, s7
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v16, 64
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s12
-; VI-NEXT:    s_and_b32 s4, s4, 7
-; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    v_mov_b32_e32 v1, s13
-; VI-NEXT:    v_mov_b32_e32 v12, s24
-; VI-NEXT:    v_mov_b32_e32 v13, s25
-; VI-NEXT:    v_mov_b32_e32 v14, s26
-; VI-NEXT:    v_mov_b32_e32 v15, s27
-; VI-NEXT:    v_mov_b32_e32 v2, s14
-; VI-NEXT:    v_mov_b32_e32 v3, s15
-; VI-NEXT:    v_mov_b32_e32 v4, s16
-; VI-NEXT:    v_mov_b32_e32 v5, s17
-; VI-NEXT:    v_mov_b32_e32 v6, s18
-; VI-NEXT:    v_mov_b32_e32 v7, s19
-; VI-NEXT:    v_mov_b32_e32 v8, s20
-; VI-NEXT:    v_mov_b32_e32 v9, s21
-; VI-NEXT:    v_mov_b32_e32 v10, s22
-; VI-NEXT:    v_mov_b32_e32 v11, s23
+; VI-NEXT:    s_and_b32 s6, s6, 7
+; VI-NEXT:    s_lshl_b32 s6, s6, 3
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_mov_b32_e32 v12, s20
+; VI-NEXT:    v_mov_b32_e32 v13, s21
+; VI-NEXT:    v_mov_b32_e32 v14, s22
+; VI-NEXT:    v_mov_b32_e32 v15, s23
+; VI-NEXT:    v_or_b32_e32 v16, s6, v16
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_mov_b32_e32 v4, s12
+; VI-NEXT:    v_mov_b32_e32 v5, s13
+; VI-NEXT:    v_mov_b32_e32 v6, s14
+; VI-NEXT:    v_mov_b32_e32 v7, s15
+; VI-NEXT:    v_mov_b32_e32 v8, s16
+; VI-NEXT:    v_mov_b32_e32 v9, s17
+; VI-NEXT:    v_mov_b32_e32 v10, s18
+; VI-NEXT:    v_mov_b32_e32 v11, s19
 ; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112
 ; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
 ; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; VI-NEXT:    v_or_b32_e32 v16, s4, v16
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, 0x40200000
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen
@@ -1702,13 +1702,13 @@
 ; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80
 ; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96
 ; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112
-; VI-NEXT:    s_mov_b32 s11, 0x1100f000
-; VI-NEXT:    s_mov_b32 s10, -1
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
-; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s7, 0x1100f000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -131,14 +131,14 @@
 ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    s_and_b32 s1, s4, 0xffff
+; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
+; CI-NEXT:    s_load_dword s1, s[4:5], 0xc
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
+; CI-NEXT:    s_and_b32 s1, s1, 0xffff
 ; CI-NEXT:    s_lshl_b32 s2, s0, 16
 ; CI-NEXT:    s_or_b32 s1, s1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s1
@@ -1102,14 +1102,14 @@
 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
 ; GFX9-LABEL: s_insertelement_v2i16_dynamic:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s1, s[2:3], 0x0
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-NEXT:    s_load_dword s1, s[6:7], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
 ; GFX9-NEXT:    s_lshl_b32 s0, 0xffff, s0
@@ -1120,14 +1120,14 @@
 ;
 ; VI-LABEL: s_insertelement_v2i16_dynamic:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
-; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
-; VI-NEXT:    s_load_dword s1, s[2:3], 0x0
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    s_load_dword s1, s[6:7], 0x0
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshl_b32 s0, s0, 4
 ; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
@@ -1138,14 +1138,14 @@
 ;
 ; CI-LABEL: s_insertelement_v2i16_dynamic:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
+; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x4
+; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
-; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
-; CI-NEXT:    s_load_dword s1, s[2:3], 0x0
+; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; CI-NEXT:    s_load_dword s1, s[6:7], 0x0
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_mov_b32_e32 v1, s5
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshl_b32 s0, s0, 4
 ; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
@@ -1683,14 +1683,14 @@
 ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x10
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
 ; GFX9-NEXT:    s_mov_b32 s1, 0
@@ -1699,7 +1699,8 @@
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], v4, s[0:1]
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s4, s4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s2, s2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_bfi_b32 v1, v5, s0, v1
 ; GFX9-NEXT:    v_bfi_b32 v0, v4, s0, v0
@@ -1804,26 +1805,27 @@
 ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
 ; VI-NEXT:    s_mov_b32 s0, 0xffff
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    s_mov_b32 s1, 0
-; VI-NEXT:    s_lshl_b32 s2, s5, 4
-; VI-NEXT:    s_and_b32 s3, s4, s0
-; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
-; VI-NEXT:    s_lshl_b32 s2, s3, 16
-; VI-NEXT:    s_or_b32 s2, s3, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s3, s3, 4
+; VI-NEXT:    s_and_b32 s2, s2, s0
+; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s3
+; VI-NEXT:    s_lshl_b32 s3, s2, 16
+; VI-NEXT:    s_or_b32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v4, s2
 ; VI-NEXT:    v_mov_b32_e32 v5, s2
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_bfi_b32 v1, s1, v4, v1
 ; VI-NEXT:    v_bfi_b32 v0, s0, v5, v0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -1832,26 +1834,27 @@
 ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
+; CI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x4
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
 ; CI-NEXT:    s_mov_b32 s0, 0xffff
-; CI-NEXT:    s_and_b32 s2, s4, s0
-; CI-NEXT:    s_lshl_b32 s4, s4, 16
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_or_b32 s2, s2, s4
 ; CI-NEXT:    s_mov_b32 s1, 0
-; CI-NEXT:    s_lshl_b32 s3, s5, 4
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_and_b32 s4, s2, s0
+; CI-NEXT:    s_lshl_b32 s2, s2, 16
+; CI-NEXT:    s_or_b32 s2, s4, s2
+; CI-NEXT:    s_lshl_b32 s3, s3, 4
 ; CI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s3
 ; CI-NEXT:    v_mov_b32_e32 v4, s2
 ; CI-NEXT:    v_mov_b32_e32 v5, s2
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_bfi_b32 v1, s1, v4, v1
 ; CI-NEXT:    v_bfi_b32 v0, s0, v5, v0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -237,29 +237,29 @@
 ; SI-LABEL: maxnum_v2f16:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
 ; SI-NEXT:    s_lshr_b32 s0, s0, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_max_f32_e32 v2, v3, v2
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_mov_b32 s0, s4
-; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -260,29 +260,29 @@
 ; SI-LABEL: minnum_v2f16_ieee:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s6, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
+; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
 ; SI-NEXT:    s_lshr_b32 s0, s0, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s6
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_min_f32_e32 v2, v3, v2
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    s_mov_b32 s0, s4
-; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -142,15 +142,15 @@
 define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
 ; SI-LABEL: round_v2f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_movk_i32 s7, 0xfc01
 ; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
 ; SI-NEXT:    s_add_i32 s14, s0, s7
+; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s14
 ; SI-NEXT:    s_brev_b32 s15, 1
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[10:11], s[0:1]
@@ -237,15 +237,15 @@
 define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
 ; SI-LABEL: round_v4f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0x11
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_movk_i32 s18, 0xfc01
 ; SI-NEXT:    s_mov_b32 s3, 0xfffff
-; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bfe_u32 s0, s11, 0xb0014
 ; SI-NEXT:    s_add_i32 s19, s0, s18
+; SI-NEXT:    s_mov_b32 s2, s6
 ; SI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s19
 ; SI-NEXT:    s_brev_b32 s20, 1
 ; SI-NEXT:    s_andn2_b64 s[16:17], s[10:11], s[0:1]
@@ -342,47 +342,47 @@
 ;
 ; CI-LABEL: round_v4f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; CI-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0x11
-; CI-NEXT:    s_brev_b32 s2, -2
+; CI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
+; CI-NEXT:    s_brev_b32 s12, -2
 ; CI-NEXT:    v_mov_b32_e32 v12, 0x3ff00000
-; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
-; CI-NEXT:    v_mov_b32_e32 v4, s11
-; CI-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
-; CI-NEXT:    v_bfi_b32 v4, s2, v12, v4
+; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[6:7]
+; CI-NEXT:    v_mov_b32_e32 v4, s7
+; CI-NEXT:    v_add_f64 v[2:3], s[6:7], -v[0:1]
+; CI-NEXT:    v_bfi_b32 v4, s12, v12, v4
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
-; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[8:9]
+; CI-NEXT:    v_trunc_f64_e32 v[8:9], s[4:5]
 ; CI-NEXT:    v_cndmask_b32_e32 v3, 0, v4, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, 0
 ; CI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
-; CI-NEXT:    v_add_f64 v[0:1], s[8:9], -v[8:9]
-; CI-NEXT:    v_mov_b32_e32 v4, s9
+; CI-NEXT:    v_add_f64 v[0:1], s[4:5], -v[8:9]
+; CI-NEXT:    v_mov_b32_e32 v4, s5
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5
-; CI-NEXT:    v_bfi_b32 v4, s2, v12, v4
+; CI-NEXT:    v_bfi_b32 v4, s12, v12, v4
 ; CI-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[14:15]
-; CI-NEXT:    v_mov_b32_e32 v10, s15
-; CI-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
-; CI-NEXT:    v_bfi_b32 v10, s2, v12, v10
+; CI-NEXT:    v_trunc_f64_e32 v[4:5], s[10:11]
+; CI-NEXT:    v_mov_b32_e32 v10, s11
+; CI-NEXT:    v_add_f64 v[6:7], s[10:11], -v[4:5]
+; CI-NEXT:    v_bfi_b32 v10, s12, v12, v10
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
 ; CI-NEXT:    v_mov_b32_e32 v6, 0
 ; CI-NEXT:    v_cndmask_b32_e32 v7, 0, v10, vcc
-; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[12:13]
+; CI-NEXT:    v_trunc_f64_e32 v[10:11], s[8:9]
 ; CI-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
-; CI-NEXT:    v_add_f64 v[4:5], s[12:13], -v[10:11]
-; CI-NEXT:    v_mov_b32_e32 v13, s13
+; CI-NEXT:    v_add_f64 v[4:5], s[8:9], -v[10:11]
+; CI-NEXT:    v_mov_b32_e32 v13, s9
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
-; CI-NEXT:    v_bfi_b32 v12, s2, v12, v13
+; CI-NEXT:    v_bfi_b32 v12, s12, v12, v13
 ; CI-NEXT:    v_cndmask_b32_e32 v5, 0, v12, vcc
 ; CI-NEXT:    v_mov_b32_e32 v4, 0
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_add_f64 v[4:5], v[10:11], v[4:5]
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    v_add_f64 v[0:1], v[8:9], v[0:1]
-; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
   %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
   store <4 x double> %result, <4 x double> addrspace(1)* %out
@@ -588,12 +588,11 @@
 ;
 ; CI-LABEL: round_v8f64:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; CI-NEXT:    s_load_dwordx16 s[8:23], s[0:1], 0x19
 ; CI-NEXT:    s_brev_b32 s2, -2
 ; CI-NEXT:    v_mov_b32_e32 v16, 0x3ff00000
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
-; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
 ; CI-NEXT:    v_mov_b32_e32 v4, s11
@@ -652,6 +651,7 @@
 ; CI-NEXT:    v_trunc_f64_e32 v[16:17], s[18:19]
 ; CI-NEXT:    v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5
 ; CI-NEXT:    v_add_f64 v[14:15], s[18:19], -v[16:17]
+; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5
 ; CI-NEXT:    v_mov_b32_e32 v14, 0
 ; CI-NEXT:    v_cndmask_b32_e64 v15, 0, v18, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -10,12 +10,8 @@
 ; GCN-LABEL: {{^}}madak_f32:
 ; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
 ; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
 ; GFX10-MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
 ; FMA:   v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
@@ -101,12 +97,8 @@
 ; GCN-LABEL: {{^}}madak_inline_imm_f32:
 ; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
 ; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
 ; MAD:   v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
 ; GFX10-MAD:   v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
 ; FMA:   v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -74,20 +74,20 @@
 ; GCN-NEXT:    v_mov_b32_e32 v5, s5
 ; GCN-NEXT:    v_mov_b32_e32 v6, s6
 ; GCN-NEXT:    v_mov_b32_e32 v7, s7
+; GCN-NEXT:    v_mov_b32_e32 v9, s9
+; GCN-NEXT:    v_mov_b32_e32 v10, s10
+; GCN-NEXT:    v_mov_b32_e32 v11, s11
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], v[4:7], off offset:16
+; GCN-NEXT:    global_store_dwordx4 v[12:13], v[8:11], off offset:32
 ; GCN-NEXT:    v_mov_b32_e32 v0, s12
-; GCN-NEXT:    v_mov_b32_e32 v9, s9
-; GCN-NEXT:    v_mov_b32_e32 v10, s10
-; GCN-NEXT:    v_mov_b32_e32 v11, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_mov_b32_e32 v3, s15
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_store_dwordx4 v[12:13], v[8:11], off offset:32
 ; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off offset:48
 ; GCN-NEXT:    s_endpgm
 bb:
@@ -118,6 +118,7 @@
 ; GCN-NEXT:    v_add_u32_e32 v1, v1, v2
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:16
 ; GCN-NEXT:    buffer_load_dword v6, v0, s[0:3], 0 offen offset:20
 ; GCN-NEXT:    buffer_load_dword v7, v0, s[0:3], 0 offen offset:24
 ; GCN-NEXT:    buffer_load_dword v8, v0, s[0:3], 0 offen offset:28
@@ -132,22 +133,20 @@
 ; GCN-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
 ; GCN-NEXT:    buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
-; GCN-NEXT:    buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
+; GCN-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen offset:12
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 offen
-; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
-; GCN-NEXT:    s_waitcnt vmcnt(4)
+; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
-; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
-; GCN-NEXT:    s_waitcnt vmcnt(4)
-; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; GCN-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
+; GCN-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen offset:16
 ; GCN-NEXT:    buffer_store_dword v6, v1, s[0:3], 0 offen offset:20
 ; GCN-NEXT:    buffer_store_dword v7, v1, s[0:3], 0 offen offset:24
 ; GCN-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen offset:28
@@ -196,19 +195,18 @@
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    global_load_dwordx2 v[8:9], v[0:1], s[2:3]
+; GCN-NEXT:    v_mov_b32_e32 v11, s5
+; GCN-NEXT:    v_mov_b32_e32 v10, s4
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
 ; GCN-NEXT:    global_load_dwordx4 v[4:7], v[8:9], off offset:16
-; GCN-NEXT:    v_mov_b32_e32 v9, s5
-; GCN-NEXT:    v_mov_b32_e32 v8, s4
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v[8:9], off
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off offset:16
+; GCN-NEXT:    global_store_dwordx4 v[10:11], v[0:3], off
+; GCN-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off offset:16
 ; GCN-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
--- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
@@ -529,8 +529,8 @@
 ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
-; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 ; GCN: buffer_store_dword v[[HI]]
 define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
   store i32 9, i32 addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -15,13 +15,13 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -86,6 +86,7 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@@ -95,20 +96,19 @@
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
-; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -224,15 +224,15 @@
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
-; GFX9:    global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
-; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX10:   global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -299,9 +299,9 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -454,10 +454,10 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
@@ -520,10 +520,10 @@
 ; GFX8:    flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
 ; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9:    global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ;
 ; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX10:   global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0) #2
   %conv = and i64 %call, 255
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -854,16 +854,16 @@
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s0, s14
 ; GCN-NEXT:    s_mov_b32 s1, s15
-; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_mov_b32 s14, 0x4f800000
 ; GCN-NEXT:    s_mov_b32 s8, s12
 ; GCN-NEXT:    s_mov_b32 s9, s13
 ; GCN-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
-; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; GCN-NEXT:    v_xor_b32_e32 v4, v4, v9
 ; GCN-NEXT:    v_xor_b32_e32 v15, v8, v9
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v9, v4
@@ -1002,16 +1002,16 @@
 ; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
 ; TONGA-NEXT:    s_mov_b32 s0, s14
 ; TONGA-NEXT:    s_mov_b32 s1, s15
-; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; TONGA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; TONGA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; TONGA-NEXT:    s_mov_b32 s14, 0x4f800000
 ; TONGA-NEXT:    s_mov_b32 s8, s12
 ; TONGA-NEXT:    s_mov_b32 s9, s13
 ; TONGA-NEXT:    s_waitcnt vmcnt(1)
-; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
-; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
 ; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v9, v4
+; TONGA-NEXT:    s_waitcnt vmcnt(0)
+; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; TONGA-NEXT:    v_xor_b32_e32 v4, v4, v9
 ; TONGA-NEXT:    v_xor_b32_e32 v15, v8, v9
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v4
@@ -1150,15 +1150,15 @@
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s0, s10
 ; GFX9-NEXT:    s_mov_b32 s1, s11
-; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
 ; GFX9-NEXT:    s_mov_b32 s4, 0x4f800000
 ; GFX9-NEXT:    s_mov_b32 s12, s8
 ; GFX9-NEXT:    s_mov_b32 s13, s9
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v9, 31, v4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_ashrrev_i32_e32 v8, 31, v0
 ; GFX9-NEXT:    v_add_u32_e32 v4, v4, v9
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v8
 ; GFX9-NEXT:    v_xor_b32_e32 v4, v4, v9
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: s_test_sdiv:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GCN-NEXT:    v_mov_b32_e32 v7, 0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
@@ -26,76 +26,76 @@
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s15, s14
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
+; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
-; GCN-NEXT:    v_mul_hi_u32 v9, v2, v3
-; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
-; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
+; GCN-NEXT:    v_mul_lo_u32 v7, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v2, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v3, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v8, v3, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
+; GCN-NEXT:    v_mul_lo_u32 v6, s4, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
+; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
+; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v7, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
 ; GCN-NEXT:    s_add_u32 s0, s10, s14
 ; GCN-NEXT:    s_addc_u32 s1, s11, s14
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, s11, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, s11, v2
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s10, v0
+; GCN-NEXT:    v_mul_hi_u32 v6, s10, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, s11, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, s11, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
 ; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v5, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
@@ -497,14 +497,14 @@
 define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv24_64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 40
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[8:9], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 40
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
@@ -527,14 +527,14 @@
 ;
 ; GCN-IR-LABEL: s_test_sdiv24_64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 40
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[8:9], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 40
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
@@ -669,14 +669,14 @@
 define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv31_64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 33
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[8:9], 33
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 33
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
@@ -699,14 +699,14 @@
 ;
 ; GCN-IR-LABEL: s_test_sdiv31_64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 33
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[8:9], 33
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 33
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
@@ -736,14 +736,14 @@
 define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv23_64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 41
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[8:9], 41
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 41
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
@@ -766,14 +766,14 @@
 ;
 ; GCN-IR-LABEL: s_test_sdiv23_64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 41
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[8:9], 41
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 41
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
@@ -803,14 +803,14 @@
 define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv25_64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, s4
-; GCN-NEXT:    s_ashr_i64 s[8:9], s[0:1], 39
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[8:9], 39
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    s_ashr_i64 s[4:5], s[6:7], 39
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s4
@@ -833,14 +833,14 @@
 ;
 ; GCN-IR-LABEL: s_test_sdiv25_64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s9, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[0:1], 39
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[8:9], 39
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    s_ashr_i64 s[4:5], s[6:7], 39
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
--- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte:
-; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
 define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
@@ -14,7 +14,7 @@
 }
 
 ; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte:
-; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
 define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) {
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x
@@ -27,7 +27,7 @@
 }
 
 ; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte:
-; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013
 define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) {
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
@@ -40,7 +40,7 @@
 }
 
 ; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte:
-; GCN: s_load_dword [[LD:s[0-9]+]],
+; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0
 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003
 define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) {
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
--- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -96,8 +96,8 @@
 define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 ; SI-LABEL: sgpr_if_else_valu_br:
 ; SI:       ; %bb.0: ; %entry
-; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xc
 ; SI-NEXT:    ; implicit-def: $sgpr6
 ; SI-NEXT:    v_cmp_lg_f32_e32 vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -478,30 +478,31 @@
 define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem23_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[4:5], 41
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 41
-; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 41
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_or_b32 s1, s1, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -510,30 +511,31 @@
 ;
 ; GCN-IR-LABEL: s_test_srem23_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 41
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 41
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 41
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-IR-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -549,30 +551,31 @@
 define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem24_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[4:5], 40
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
-; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_or_b32 s1, s1, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -581,30 +584,31 @@
 ;
 ; GCN-IR-LABEL: s_test_srem24_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 40
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 40
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-IR-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -674,30 +678,31 @@
 define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem25_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[4:5], 39
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 39
-; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 39
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_or_b32 s1, s1, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -706,30 +711,31 @@
 ;
 ; GCN-IR-LABEL: s_test_srem25_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 39
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 39
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 39
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-IR-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -745,30 +751,31 @@
 define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem31_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i64 s[8:9], s[4:5], 33
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_ashr_i64 s[6:7], s[6:7], 33
-; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 33
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_or_b32 s1, s1, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -777,30 +784,31 @@
 ;
 ; GCN-IR-LABEL: s_test_srem31_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s1, s[0:1], 0xe
+; GCN-IR-NEXT:    s_load_dword s5, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 33
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 33
-; GCN-IR-NEXT:    s_ashr_i64 s[0:1], s[0:1], 33
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s6
-; GCN-IR-NEXT:    s_xor_b32 s1, s6, s0
-; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_xor_b32 s0, s6, s8
+; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 31
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
@@ -817,28 +825,28 @@
 define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem32_64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s8, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s7
-; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GCN-NEXT:    s_xor_b32 s1, s7, s0
-; GCN-NEXT:    s_ashr_i32 s1, s1, 30
-; GCN-NEXT:    s_or_b32 s1, s1, 1
+; GCN-NEXT:    s_xor_b32 s0, s7, s8
+; GCN-NEXT:    s_ashr_i32 s0, s0, 30
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-NEXT:    s_mov_b32 s1, s5
+; GCN-NEXT:    s_or_b32 s0, s0, 1
+; GCN-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s7, v0
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -846,28 +854,28 @@
 ;
 ; GCN-IR-LABEL: s_test_srem32_64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s8, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s8
 ; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v1, s7
-; GCN-IR-NEXT:    v_cvt_f32_i32_e32 v0, s0
-; GCN-IR-NEXT:    s_xor_b32 s1, s7, s0
-; GCN-IR-NEXT:    s_ashr_i32 s1, s1, 30
-; GCN-IR-NEXT:    s_or_b32 s1, s1, 1
+; GCN-IR-NEXT:    s_xor_b32 s0, s7, s8
+; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 30
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    v_mov_b32_e32 v3, s1
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
+; GCN-IR-NEXT:    s_or_b32 s0, s0, 1
+; GCN-IR-NEXT:    v_mov_b32_e32 v3, s0
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
 ; GCN-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s8
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s7, v0
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -86,9 +86,9 @@
 ; SI-LABEL: truncate_high_elt_extract_vector:
 ; SI:       ; %bb.0: ; %bb
 ; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; SI-NEXT:    s_load_dword s5, s[6:7], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -434,18 +434,18 @@
 define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv24_64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_lshr_b32 s0, s0, 8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GCN-NEXT:    s_lshr_b32 s2, s2, 8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-NEXT:    s_lshr_b32 s0, s7, 8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -459,18 +459,18 @@
 ;
 ; GCN-IR-LABEL: s_test_udiv24_64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
-; GCN-IR-NEXT:    s_lshr_b32 s0, s0, 8
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GCN-IR-NEXT:    s_lshr_b32 s2, s2, 8
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-IR-NEXT:    s_lshr_b32 s0, s7, 8
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -534,16 +534,15 @@
 ; GCN-LABEL: s_test_udiv32_i64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_mov_b32 s0, s4
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_mov_b32 s4, s0
-; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -551,22 +550,21 @@
 ; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_udiv32_i64:
 ; GCN-IR:       ; %bb.0:
 ; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xe
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-IR-NEXT:    s_mov_b32 s4, s0
-; GCN-IR-NEXT:    s_mov_b32 s5, s1
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -574,7 +572,7 @@
 ; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
-; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr i64 %x, 32
   %2 = lshr i64 %y, 32
@@ -586,18 +584,18 @@
 define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv31_i64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_lshr_b32 s0, s0, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GCN-NEXT:    s_lshr_b32 s2, s2, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-NEXT:    s_lshr_b32 s0, s7, 1
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -611,18 +609,18 @@
 ;
 ; GCN-IR-LABEL: s_test_udiv31_i64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
-; GCN-IR-NEXT:    s_lshr_b32 s0, s0, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GCN-IR-NEXT:    s_lshr_b32 s2, s2, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-IR-NEXT:    s_lshr_b32 s0, s7, 1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -643,18 +641,18 @@
 define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv23_i64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s1, s5
-; GCN-NEXT:    s_lshr_b32 s0, s0, 9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GCN-NEXT:    s_lshr_b32 s2, s2, 9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-NEXT:    s_lshr_b32 s0, s7, 9
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-NEXT:    s_mov_b32 s0, s4
+; GCN-NEXT:    s_mov_b32 s1, s5
 ; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
@@ -668,18 +666,18 @@
 ;
 ; GCN-IR-LABEL: s_test_udiv23_i64:
 ; GCN-IR:       ; %bb.0:
+; GCN-IR-NEXT:    s_load_dword s2, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dword s0, s[0:1], 0xe
 ; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_mov_b32 s1, s5
-; GCN-IR-NEXT:    s_lshr_b32 s0, s0, 9
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GCN-IR-NEXT:    s_lshr_b32 s2, s2, 9
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; GCN-IR-NEXT:    s_lshr_b32 s0, s7, 9
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s0
-; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GCN-IR-NEXT:    s_mov_b32 s0, s4
+; GCN-IR-NEXT:    s_mov_b32 s1, s5
 ; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -508,86 +508,86 @@
 define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_urem31_v2i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x11
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s2, s9, 1
-; GCN-NEXT:    s_lshr_b32 s0, s1, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GCN-NEXT:    s_lshr_b32 s3, s3, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_lshr_b32 s1, s11, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s1
-; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GCN-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-NEXT:    s_lshr_b32 s6, s7, 1
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-NEXT:    s_lshr_b32 s7, s11, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s6
+; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v5
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_brev_b32 s0, -2
-; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_mad_f32 v2, -v2, v5, v4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem31_v2i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x11
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 1
-; GCN-IR-NEXT:    s_lshr_b32 s0, s1, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GCN-IR-NEXT:    s_lshr_b32 s3, s3, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_lshr_b32 s1, s11, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-IR-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-IR-NEXT:    s_lshr_b32 s6, s7, 1
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-IR-NEXT:    s_lshr_b32 s7, s11, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s6
+; GCN-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-IR-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v5
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    s_brev_b32 s0, -2
-; GCN-IR-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v5, v4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-IR-NEXT:    s_brev_b32 s4, -2
+; GCN-IR-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 33>
   %2 = lshr <2 x i64> %y, <i64 33, i64 33>
@@ -658,86 +658,86 @@
 define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
 ; GCN-LABEL: s_test_urem23_64_v2i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x11
+; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshr_b32 s2, s9, 1
-; GCN-NEXT:    s_lshr_b32 s0, s1, 1
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GCN-NEXT:    s_lshr_b32 s3, s3, 9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-NEXT:    s_lshr_b32 s1, s11, 9
-; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s1
-; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GCN-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-NEXT:    s_lshr_b32 s6, s7, 9
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-NEXT:    s_lshr_b32 s7, s11, 9
+; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s6
+; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v5
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_brev_b32 s0, -2
-; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-NEXT:    v_mad_f32 v2, -v2, v5, v4
+; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-NEXT:    s_brev_b32 s4, -2
+; GCN-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem23_64_v2i64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GCN-IR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
-; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IR-NEXT:    s_mov_b32 s6, -1
+; GCN-IR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x11
+; GCN-IR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IR-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_lshr_b32 s2, s9, 1
-; GCN-IR-NEXT:    s_lshr_b32 s0, s1, 1
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v1, s2
-; GCN-IR-NEXT:    s_lshr_b32 s3, s3, 9
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
-; GCN-IR-NEXT:    s_lshr_b32 s1, s11, 9
-; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v3, s1
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
-; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_mad_f32 v1, -v2, v0, v1
-; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v4
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s0
-; GCN-IR-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GCN-IR-NEXT:    s_lshr_b32 s4, s9, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GCN-IR-NEXT:    s_lshr_b32 s5, s5, 1
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v2, s5
+; GCN-IR-NEXT:    s_lshr_b32 s6, s7, 9
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GCN-IR-NEXT:    s_lshr_b32 s7, s11, 9
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v5, s7
+; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v4, s6
+; GCN-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GCN-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GCN-IR-NEXT:    v_mad_f32 v2, -v3, v0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v6, v3
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v5
+; GCN-IR-NEXT:    s_mov_b32 s2, -1
+; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GCN-IR-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GCN-IR-NEXT:    v_trunc_f32_e32 v2, v2
-; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
-; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v4, v3
-; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
-; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
-; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s3
-; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IR-NEXT:    s_brev_b32 s0, -2
-; GCN-IR-NEXT:    v_and_b32_e32 v0, s0, v0
-; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
-; GCN-IR-NEXT:    v_and_b32_e32 v2, s0, v2
+; GCN-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GCN-IR-NEXT:    v_mad_f32 v2, -v2, v5, v4
+; GCN-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v5
+; GCN-IR-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-IR-NEXT:    v_mul_lo_u32 v2, v2, s7
+; GCN-IR-NEXT:    s_brev_b32 s4, -2
+; GCN-IR-NEXT:    v_and_b32_e32 v0, s4, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v3, v1
-; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-IR-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
+; GCN-IR-NEXT:    v_and_b32_e32 v2, s4, v2
+; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 41>
   %2 = lshr <2 x i64> %y, <i64 33, i64 41>
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -147,11 +147,11 @@
 
 define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
   %tmp17 = shl i32 %index, 5
-; GFX9: buffer_load_dwordx4
+; GFX9-DAG: buffer_load_dwordx4
   %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
   %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
   %tmp19 = or i32 %tmp17, 16
-; GFX9: buffer_load_dwordx2
+; GFX9-DAG: buffer_load_dwordx2
   %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
   %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
   %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)