diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1624,16 +1624,32 @@
     LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                       << SUb->NodeNum << ")\n");
-    // Copy successor edges from SUa to SUb. Interleaving computation
-    // dependent on SUa can prevent load combining due to register reuse.
-    // Predecessor edges do not need to be copied from SUb to SUa since
-    // nearby loads should have effectively the same inputs.
-    for (const SDep &Succ : SUa->Succs) {
-      if (Succ.getSUnit() == SUb)
-        continue;
-      LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
-                        << ")\n");
-      DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+    if (IsLoad) {
+      // Copy successor edges from SUa to SUb. Interleaving computation
+      // dependent on SUa can prevent load combining due to register reuse.
+      // Predecessor edges do not need to be copied from SUb to SUa since
+      // nearby loads should have effectively the same inputs.
+      for (const SDep &Succ : SUa->Succs) {
+        if (Succ.getSUnit() == SUb)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+      }
+    } else {
+      // Copy predecessor edges from SUb to SUa to prevent the SUnits that
+      // SUb depends on from being scheduled in between SUb and SUa.
+      // Successor edges do not need to be copied from SUa to SUb since
+      // nothing will depend on the stores.
+      // Note that we do not need to worry about memory dependencies here:
+      // we never try to cluster mem ops that have a memory dependency.
+      for (const SDep &Pred : SUb->Preds) {
+        if (Pred.getSUnit() == SUa)
+          continue;
+        LLVM_DEBUG(dbgs() << "  Copy Pred SU(" << Pred.getSUnit()->NodeNum
+                          << ")\n");
+        DAG->addEdge(SUa, SDep(Pred.getSUnit(), SDep::Artificial));
+      }
     }
     LLVM_DEBUG(dbgs() << "  Curr cluster length: " << ClusterLength
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
--- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -194,3 +194,22 @@
   store i64 %add6.3, i64* %arrayidx5.3, align 8
   ret void
 }
+
+; Verify that SU(2) and SU(4) are the preds of SU(3)
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_missing_preds_edges:%bb.0
+; CHECK:Cluster ld/st SU(3) - SU(5)
+; CHECK: Copy Pred SU(4)
+; CHECK: Copy Pred SU(2)
+; CHECK:SU(2): %0:gpr64common = COPY $x0
+; CHECK:SU(3): STRWui %1:gpr32, %0:gpr64common, 0
+; CHECK:SU(4): %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0
+; CHECK:SU(5): STRWui %3:gpr32common, %0:gpr64common, 1
+define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) {
+entry:
+  store i32 %m, i32* %p, align 4
+  %add = add nsw i32 %n, 5
+  %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
+  store i32 %add, i32* %arrayidx1, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -744,13 +744,13 @@
 ; GCN-LABEL: {{^}}tail_call_byval_align16:
 ; GCN-NOT: s32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12
 ; GCN: s_getpc_b64
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -624,11 +624,10 @@
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
-
 ; FIXEDABI: s_movk_i32 s32, 0x400{{$}}
-
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}}
+
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 ; FIXME: Why this reload?
@@ -670,9 +669,8 @@
 ; FIXED-ABI-NOT: v31
 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
-; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
-
 ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}}
 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
 ; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1368,11 +1368,11 @@
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
-; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
 ; GFX9-NEXT:    buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
 ; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
+; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -307,6 +307,7 @@
 ; GCN: flat_load_dwordx4
 ; GCN: flat_load_dwordx4
+; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
@@ -315,7 +316,6 @@
 ; GCN: flat_store_dwordx4
-; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
 ; SI: v_cvt_f32_f16_e32
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -160,16 +160,16 @@
 ; GCN-NEXT: s_mov_b64 exec, s[4:5]
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
 ; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN: s_mov_b32 s34, s32
 ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
-
-; GCN: s_mov_b32 s34, s32
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-
-; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
 ; GCN-NEXT: s_add_u32 s32, s32, 0x30000
+; GCN: v_mov_b32_e32 v33, 0
+
+; GCN: buffer_store_dword v33, off, s[0:3], s33 offset:1024
+
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]