Index: llvm/lib/CodeGen/MachineScheduler.cpp =================================================================== --- llvm/lib/CodeGen/MachineScheduler.cpp +++ llvm/lib/CodeGen/MachineScheduler.cpp @@ -1579,16 +1579,30 @@ DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) { LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU(" << SUb->NodeNum << ")\n"); - // Copy successor edges from SUa to SUb. Interleaving computation - // dependent on SUa can prevent load combining due to register reuse. - // Predecessor edges do not need to be copied from SUb to SUa since nearby - // loads should have effectively the same inputs. - for (const SDep &Succ : SUa->Succs) { - if (Succ.getSUnit() == SUb) - continue; - LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum - << ")\n"); - DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); + if (IsLoad) { + // Copy successor edges from SUa to SUb. Interleaving computation + // dependent on SUa can prevent load combining due to register reuse. + // Predecessor edges do not need to be copied from SUb to SUa since + // nearby loads should have effectively the same inputs. + for (const SDep &Succ : SUa->Succs) { + if (Succ.getSUnit() == SUb) + continue; + LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum + << ")\n"); + DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial)); + } + } else { + // Copy predecessor edges from SUb to SUa to avoid the SUnits that + // SUb depends on being scheduled in-between SUb and SUa. Successor + // edges do not need to be copied from SUa to SUb since no one will + // depend on stores. 
+ for (const SDep &Pred : SUb->Preds) { + if (Pred.getSUnit() == SUa) + continue; + LLVM_DEBUG(dbgs() << " Copy Pred SU(" << Pred.getSUnit()->NodeNum + << ")\n"); + DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial)); + } } ++ClusterLength; } else Index: llvm/test/CodeGen/AArch64/macro-fusion.ll =================================================================== --- llvm/test/CodeGen/AArch64/macro-fusion.ll +++ llvm/test/CodeGen/AArch64/macro-fusion.ll @@ -18,3 +18,22 @@ %sub = sub nsw i32 %add1, %d ret i32 %sub } + +; Verify that the load/store cluster creates the necessary dependencies +; between two fused stores. +define void @test_cluster(i32* %p, i32 %m, i32 %n) { +entry: +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: test_cluster:%bb.0 entry +; CHECK: Cluster ld/st SU([[SU3:[0-9]+]]) - SU([[SU4:[0-9]+]]) +; CHECK: Copy Pred SU([[SU2:[0-9]+]]) +; CHECK: Copy Pred SU([[SU0:[0-9]+]]) +; CHECK: SU([[SU0]]): %{{[0-9]+}}:gpr32 = COPY $w2 +; CHECK: SU([[SU2]]): %{{[0-9]+}}:gpr64common = COPY $x0 +; CHECK: SU([[SU3]]): STRWui +; CHECK: SU([[SU4]]): STRWui + store i32 %m, i32* %p, align 4 + %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1 + store i32 %n, i32* %arrayidx1, align 4 + ret void +} Index: llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -133,10 +133,10 @@ ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 +; GCN: 
buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 @@ -331,12 +331,12 @@ ; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 ; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 ; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 ; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 -; GCN: s_waitcnt vmcnt(0) +; GCN: s_waitcnt vmcnt(3) ; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 ; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 ; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 Index: llvm/test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/max.i16.ll +++ llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -134,25 +134,25 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: flat_load_dword v8, v[0:1] -; VI-NEXT: flat_load_ushort v9, v[4:5] +; VI-NEXT: flat_load_dword v7, v[0:1] +; VI-NEXT: flat_load_ushort v4, v[4:5] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: flat_load_dword v5, v[2:3] +; VI-NEXT: flat_load_ushort v8, 
v[0:1] +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v1, v8, v2 -; VI-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_max_i16_e32 v6, v7, v5 +; VI-NEXT: v_max_i16_sdwa v5, v7, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v5, v6, v5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v0, v9, v0 -; VI-NEXT: flat_store_dword v[6:7], v1 -; VI-NEXT: flat_store_short v[4:5], v0 +; VI-NEXT: v_max_i16_e32 v4, v4, v8 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -94,7 +94,7 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ 
llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -14,6 +14,7 @@ ; ; GCN-LABEL: {{^}}ps_main: +; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 ; GCN-DAG: s_mov_b32 s6, -1 @@ -23,14 +24,13 @@ ; GFX10_W32-DAG: s_mov_b32 s7, 0x31c16000 ; GFX10_W64-DAG: s_mov_b32 s7, 0x31e16000 ; GCN-NOT: s_mov_b32 s0 -; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN-DAG: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN-DAG: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx