diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PriorityQueue.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -1525,7 +1526,11 @@
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
 
 protected:
-  void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG);
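+  /// Cluster neighboring mem ops from \p MemOps, recording every unit that
+  /// joins a cluster in \p ClusteredUnits so that later calls can skip it.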
+  void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps,
+                                ScheduleDAGInstrs *DAG,
+                                SmallBitVector &ClusteredUnits);
 };
 
 class StoreClusterMutation : public BaseMemOpClusterMutation {
@@ -1562,9 +1567,13 @@
 } // end namespace llvm
 
 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
-    ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
+    ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG,
+    SmallBitVector &ClusteredUnits) {
   SmallVector<MemOpInfo, 32> MemOpRecords;
   for (SUnit *SU : MemOps) {
+    // Skip units that are already part of a cluster.
+    if (ClusteredUnits.test(SU->NodeNum))
+      continue;
     SmallVector<const MachineOperand *, 4> BaseOps;
     int64_t Offset;
     if (TII->getMemOperandsWithOffset(*SU->getInstr(), BaseOps, Offset, TRI))
@@ -1602,6 +1611,9 @@
           DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
         }
         ++ClusterLength;
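+        // Remember both ends of the new cluster edge so later chains skip them.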
+        ClusteredUnits.set(SUa->NodeNum);
+        ClusteredUnits.set(SUb->NodeNum);
       } else
         ClusterLength = 1;
     } else
@@ -1630,9 +1642,17 @@
     Chain.push_back(&SU);
   }
 
-  // Iterate over the store chains.
-  for (auto &SCD : StoreChains)
-    clusterNeighboringMemOps(SCD.second, DAG);
+  // Iterate over the store chains. Units with no control predecessor (keyed
+  // by the sentinel DAG->SUnits.size()) are appended to every other group.
+  SmallBitVector ClusteredUnits(DAG->SUnits.size());
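+  // FindAndConstruct creates the free-unit bucket even if it is empty.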
+  const auto &Free = StoreChains.FindAndConstruct(DAG->SUnits.size()).second;
+  for (auto &SCD : StoreChains) {
+    if (SCD.first != DAG->SUnits.size())
+      for (SUnit *S : Free)
+        SCD.second.push_back(S);
+    clusterNeighboringMemOps(SCD.second, DAG, ClusteredUnits);
+  }
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
--- a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
+++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll
@@ -3,15 +3,17 @@
 
 define dso_local void @jsimd_idct_ifast_neon_intrinsic(i8* nocapture readonly %dct_table, i16* nocapture readonly %coef_block, i8** nocapture readonly %output_buf, i32 %output_col) local_unnamed_addr #0 {
 ; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic:
-; CHECK:       // %bb.0: // %entry
+; CHECK:       .Ljsimd_idct_ifast_neon_intrinsic$local:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    ldr q0, [x1, #32]
-; CHECK-NEXT:    ldr q1, [x1, #96]
-; CHECK-NEXT:    ldr q2, [x0, #32]
+; CHECK-NEXT:    ldr q1, [x0, #32]
+; CHECK-NEXT:    ldr q2, [x1, #96]
 ; CHECK-NEXT:    ldr q3, [x0, #96]
 ; CHECK-NEXT:    ldr x8, [x2, #48]
+; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    mov w9, w3
-; CHECK-NEXT:    mul v0.8h, v2.8h, v0.8h
-; CHECK-NEXT:    mul v1.8h, v3.8h, v1.8h
+; CHECK-NEXT:    mul v1.8h, v3.8h, v2.8h
 ; CHECK-NEXT:    add v2.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    str q2, [x8, x9]
 ; CHECK-NEXT:    ldr x8, [x2, #56]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -1595,19 +1595,19 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x10
-; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    v_mov_b32_e32 v4, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    ds_inc_rtn_u32 v4, v1, v0
-; GFX9-NEXT:    ds_inc_rtn_u32 v5, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    ds_inc_rtn_u32 v6, v5, v4
+; GFX9-NEXT:    ds_inc_rtn_u32 v4, v5, v4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    global_store_dword v[0:1], v4, off
+; GFX9-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v[2:3], v5, off
+; GFX9-NEXT:    global_store_dword v[2:3], v4, off
 ; GFX9-NEXT:    s_endpgm
   %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
   %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
--- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -7,7 +7,6 @@
 ; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v8f32:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
@@ -23,7 +22,6 @@
 ; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v8f32:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
@@ -39,7 +37,6 @@
 ; GCN-LABEL: {{^}}store_bitcast_constant_v4i64_to_v4f64:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
@@ -55,7 +52,6 @@
 ; GCN-LABEL: {{^}}store_bitcast_constant_v8i32_to_v16i16:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -491,12 +491,12 @@
 ; GCN: enable_vgpr_workitem_id = 0
 ; GCN-DAG: s_mov_b32 s33, s7
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; GCN: s_add_u32 s32, s33, 0x400{{$}}
 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
-; GCN: s_add_u32 s32, s33, 0x400{{$}}
 
 ; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
 
 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
@@ -520,8 +520,8 @@
 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}}
 ; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}}
 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
--- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -51,8 +51,8 @@
 ; Same frame index is used multiple times in the store
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 define amdgpu_kernel void @stored_fi_to_self() #0 {
   %tmp = alloca i32 addrspace(5)*, addrspace(5)
@@ -66,9 +66,9 @@
 
 ; GCN-LABEL: {{^}}stored_fi_to_self_offset:
 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
+
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
 
 ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -364,13 +364,13 @@
 ; GCN-LABEL: chain_hi_to_lo_group_may_alias_store:
 ; GCN:       ; %bb.0: ; %bb
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v3, 0x7b
-; GCN-NEXT:    ds_read_u16 v2, v0
-; GCN-NEXT:    ds_write_b16 v1, v3
+; GCN-NEXT:    v_mov_b32_e32 v2, 0x7b
+; GCN-NEXT:    ds_read_u16 v3, v0
+; GCN-NEXT:    ds_write_b16 v1, v2
 ; GCN-NEXT:    ds_read_u16 v0, v0 offset:2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GCN-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GCN-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, i16 addrspace(3)* %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -98,8 +98,8 @@
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN-NOT: {{s|flat|buffer|global}}_load
 ; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
-; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
+; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_byte [[V_ELT2]]
 ; GCN: buffer_store_byte [[V_LOAD0]]
 define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -20,22 +20,19 @@
 ; GFX7-UNALIGNED-LABEL: global_load_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX7-UNALIGNED-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; GFX7-UNALIGNED-NEXT:    flat_load_ushort v0, v[0:1]
-; GFX7-UNALIGNED-NEXT:    flat_load_ushort v1, v[2:3]
+; GFX7-UNALIGNED-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-UNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: global_load_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v[0:1], off
-; GFX9-NEXT:    global_load_ushort v0, v[0:1], off offset:2
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-NEXT:    v_bfi_b32 v1, v1, 0, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
   %p.0 = load i16, i16 addrspace(1)* %p, align 2
@@ -52,45 +49,37 @@
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, 1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v5, 2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-ALIGNED-NEXT:    flat_store_short v[0:1], v2
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 2
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-ALIGNED-NEXT:    flat_store_short v[0:1], v2
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, s2
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, s3
+; GFX7-ALIGNED-NEXT:    flat_store_short v[0:1], v4
+; GFX7-ALIGNED-NEXT:    flat_store_short v[2:3], v5
 ; GFX7-ALIGNED-NEXT:    s_endpgm
 ;
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 1
+; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-UNALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-UNALIGNED-NEXT:    flat_store_short v[0:1], v2
-; GFX7-UNALIGNED-NEXT:    s_addc_u32 s3, s1, 0
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s2
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 2
-; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-UNALIGNED-NEXT:    flat_store_short v[0:1], v2
+; GFX7-UNALIGNED-NEXT:    flat_store_dword v[0:1], v2
 ; GFX7-UNALIGNED-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: global_store_2xi16_align2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
-; GFX9-NEXT:    v_mov_b32_e32 v2, 1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
-; GFX9-NEXT:    global_store_short v[0:1], v2, off
-; GFX9-NEXT:    global_store_short v[0:1], v3, off offset:2
+; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
   %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
   store i16 1, i16 addrspace(1)* %r, align 2
@@ -155,8 +144,9 @@
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x2
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, 1
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v5, 0
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v8, 1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v9, 0
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v10, 2
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s3, s1, 0
@@ -165,18 +155,17 @@
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
+; GFX7-ALIGNED-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v5, s1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v7, s3
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v4
-; GFX7-ALIGNED-NEXT:    flat_store_byte v[2:3], v5
-; GFX7-ALIGNED-NEXT:    s_addc_u32 s1, s1, 0
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, 2
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, s3
-; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v5
-; GFX7-ALIGNED-NEXT:    flat_store_byte v[2:3], v4
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, s0
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v6, s2
+; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v8
+; GFX7-ALIGNED-NEXT:    flat_store_byte v[2:3], v9
+; GFX7-ALIGNED-NEXT:    flat_store_byte v[4:5], v9
+; GFX7-ALIGNED-NEXT:    flat_store_byte v[6:7], v10
 ; GFX7-ALIGNED-NEXT:    s_endpgm
 ;
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll
--- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll
@@ -45,8 +45,8 @@
   store volatile i64 %add7, i64 addrspace(1)* %ptr9
 
 ; Test various offset boundaries.
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
 ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
 ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
   %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
   %load11 = load i64, i64 addrspace(1)* %gep11
diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
--- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir
@@ -35,12 +35,12 @@
   ; CHECK:   [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
   ; CHECK:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
   ; CHECK:   undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec
   ; CHECK:   dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec
   ; CHECK:   [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
+  ; CHECK:   [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+  ; CHECK:   [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK:   undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
   ; CHECK:   %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -341,16 +341,15 @@
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bfe_i32 s3, s0, 0x80008
 ; SI-NEXT:    s_ashr_i32 s1, s0, 24
 ; SI-NEXT:    s_bfe_i32 s2, s0, 0x80010
-; SI-NEXT:    s_bfe_i32 s3, s0, 0x80008
 ; SI-NEXT:    s_sext_i32_i8 s0, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -469,13 +468,12 @@
 ; SI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 48
 ; SI-NEXT:    s_ashr_i32 s5, s6, 16
 ; SI-NEXT:    s_sext_i32_i16 s6, s6
-; SI-NEXT:    v_mov_b32_e32 v0, s6
-; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
-; SI-NEXT:    v_mov_b32_e32 v0, s5
 ; SI-NEXT:    s_sext_i32_i16 s7, s7
+; SI-NEXT:    v_mov_b32_e32 v0, s6
+; SI-NEXT:    v_mov_b32_e32 v1, s5
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt expcnt(1)
 ; SI-NEXT:    v_mov_b32_e32 v0, s7
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
@@ -493,12 +491,12 @@
 ; VI-NEXT:    s_ashr_i32 s5, s6, 16
 ; VI-NEXT:    s_sext_i32_i16 s6, s6
 ; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s6
 ; VI-NEXT:    s_ashr_i32 s4, s7, 16
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT:    v_mov_b32_e32 v0, s5
 ; VI-NEXT:    s_sext_i32_i16 s7, s7
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s7
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4