Index: llvm/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/lib/CodeGen/MachineScheduler.cpp
+++ llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1579,6 +1579,31 @@
 
   llvm::sort(MemOpRecords);
   unsigned ClusterLength = 1;
+  SmallVector<SUnit *, 4> ClusteredChain;
+
+  // Copy successor edges from all SUs in a cluster to the last SU. The last
+  // SUnit becomes a sentinel for the whole cluster.
+  // Interleaving computation dependent on an SU inside the cluster can prevent
+  // load combining due to register reuse. Predecessor edges do not need to be
+  // copied from the last SU to all other nodes since nearby loads should have
+  // effectively the same inputs.
+  auto transferSuccessors = [DAG, &ClusteredChain]() -> void {
+    if (ClusteredChain.empty())
+      return;
+
+    SUnit *SU = ClusteredChain.pop_back_val();
+    while (!ClusteredChain.empty()) {
+      SUnit *SUa = ClusteredChain.pop_back_val();
+      for (const SDep &Succ : SUa->Succs) {
+        if (Succ.getSUnit() == SU)
+          continue;
+        LLVM_DEBUG(dbgs()
+                   << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
+        DAG->addEdge(Succ.getSUnit(), SDep(SU, SDep::Artificial));
+      }
+    }
+  };
+
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
     SUnit *SUa = MemOpRecords[Idx].SU;
     SUnit *SUb = MemOpRecords[Idx+1].SU;
@@ -1587,26 +1612,24 @@
                                  ClusterLength + 1)) {
       if (SUa->NodeNum > SUb->NodeNum)
         std::swap(SUa, SUb);
+      if (ClusterLength == 1)
+        ClusteredChain.push_back(SUa);
+      ClusteredChain.push_back(SUb);
       if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
         LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
                           << SUb->NodeNum << ")\n");
-        // Copy successor edges from SUa to SUb. Interleaving computation
-        // dependent on SUa can prevent load combining due to register reuse.
-        // Predecessor edges do not need to be copied from SUb to SUa since
-        // nearby loads should have effectively the same inputs.
-        for (const SDep &Succ : SUa->Succs) {
-          if (Succ.getSUnit() == SUb)
-            continue;
-          LLVM_DEBUG(dbgs()
-                     << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
-          DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
-        }
         ++ClusterLength;
-      } else
+      } else {
+        transferSuccessors();
         ClusterLength = 1;
-    } else
+      }
+    } else {
+      transferSuccessors();
       ClusterLength = 1;
+    }
   }
+
+  transferSuccessors();
 }
 
 /// Callback from DAG postProcessing to create cluster edges for loads.
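To make the intent of the new transferSuccessors lambda easier to follow, here is a minimal standalone C++ sketch of the same "sentinel" idea. The Node struct, the free addEdge function, and the transferSuccessors signature below are illustrative stand-ins, not LLVM's SUnit/SDep/ScheduleDAGInstrs API: successor edges of every clustered node are copied onto the last node of the cluster, so a computation that consumes one of the clustered loads cannot be scheduled in between them.

#include <algorithm>
#include <cstdio>
#include <vector>

// Toy DAG node; Succs lists nodes that must be scheduled after this one.
struct Node {
  unsigned Num;
  std::vector<Node *> Succs;
};

// Adds Pred -> Succ unless the edge already exists; returns true on insert,
// loosely mirroring the bool result of the DAG's addEdge in the patch.
static bool addEdge(Node *Pred, Node *Succ) {
  if (std::find(Pred->Succs.begin(), Pred->Succs.end(), Succ) !=
      Pred->Succs.end())
    return false;
  Pred->Succs.push_back(Succ);
  return true;
}

// Copy successor edges from every node in the cluster to the last node,
// making it a sentinel for the whole cluster, and drain the chain so the
// next cluster starts fresh.
static void transferSuccessors(std::vector<Node *> &Chain) {
  if (Chain.empty())
    return;
  Node *Last = Chain.back();
  Chain.pop_back();
  while (!Chain.empty()) {
    Node *N = Chain.back();
    Chain.pop_back();
    for (Node *Succ : N->Succs) {
      if (Succ == Last)
        continue;
      addEdge(Last, Succ); // Succ now also waits for the sentinel
    }
  }
}

int main() {
  Node LoadA{0, {}}, LoadB{1, {}}, LoadC{2, {}}, Add{3, {}};
  addEdge(&LoadB, &Add); // an add consumes the middle load
  std::vector<Node *> Cluster{&LoadA, &LoadB, &LoadC};
  transferSuccessors(Cluster);
  // LoadC is the sentinel: the add now depends on it too, so a scheduler
  // cannot slip the add in between the clustered loads.
  std::printf("sentinel successors: %zu\n", LoadC.Succs.size()); // prints 1
}

As the patch comment notes, predecessor edges are deliberately not copied back to the earlier nodes, since nearby loads effectively share the same inputs; only the successor side needs the sentinel.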
Index: llvm/test/CodeGen/AMDGPU/cluster_stores.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s 2>&1 | FileCheck --enable-var-scope --check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}cluster_load_cluster_store:
+; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
+; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
+; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
+define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+bb:
+  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
+  %ld0 = load i32, i32* %la0
+  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
+  %ld1 = load i32, i32* %la1
+  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
+  %ld2 = load i32, i32* %la2
+  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
+  %ld3 = load i32, i32* %la3
+  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
+
+  store i32 %ld0, i32* %sa0
+  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
+  store i32 %ld1, i32* %sa1
+  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
+  store i32 %ld2, i32* %sa2
+  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
+  store i32 %ld3, i32* %sa3
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}cluster_load_valu_cluster_store:
+; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
+; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
+; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
+define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+bb:
+  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
+  %ld0 = load i32, i32* %la0
+  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
+  %ld1 = load i32, i32* %la1
+  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
+  %ld2 = load i32, i32* %la2
+  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
+  %ld3 = load i32, i32* %la3
+  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
+
+  store i32 %ld0, i32* %sa0
+  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
+
+  %add = add i32 %ld1, 1
+  store i32 %add, i32* %sa1
+  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
+  store i32 %ld2, i32* %sa2
+  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
+  store i32 %ld3, i32* %sa3
+
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -86,9 +86,9 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
@@ -225,11 +225,11 @@
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ;
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024