diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1580,34 +1580,47 @@
     return;
 
   llvm::sort(MemOpRecords);
-  unsigned ClusterLength = 1;
+
+  unsigned ClusterLength = 2;
   for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
-    SUnit *SUa = MemOpRecords[Idx].SU;
-    SUnit *SUb = MemOpRecords[Idx+1].SU;
-    if (TII->shouldClusterMemOps(MemOpRecords[Idx].BaseOps,
-                                 MemOpRecords[Idx + 1].BaseOps,
-                                 ClusterLength + 1)) {
-      if (SUa->NodeNum > SUb->NodeNum)
-        std::swap(SUa, SUb);
-      if (DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-        LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
-                          << SUb->NodeNum << ")\n");
-        // Copy successor edges from SUa to SUb. Interleaving computation
-        // dependent on SUa can prevent load combining due to register reuse.
-        // Predecessor edges do not need to be copied from SUb to SUa since
-        // nearby loads should have effectively the same inputs.
-        for (const SDep &Succ : SUa->Succs) {
-          if (Succ.getSUnit() == SUb)
-            continue;
-          LLVM_DEBUG(dbgs()
-                     << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum << ")\n");
-          DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
-        }
-        ++ClusterLength;
-      } else
-        ClusterLength = 1;
-    } else
-      ClusterLength = 1;
+    // The decision to cluster mem ops is made by target-dependent logic.
+    auto MemOpa = MemOpRecords[Idx];
+    auto MemOpb = MemOpRecords[Idx + 1];
+    if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps,
+                                  ClusterLength)) {
+      // The current pair of mem ops could not be clustered; reset the
+      // cluster length and go to the next pair.
+      ClusterLength = 2;
+      continue;
+    }
+
+    SUnit *SUa = MemOpa.SU;
+    SUnit *SUb = MemOpb.SU;
+    if (SUa->NodeNum > SUb->NodeNum)
+      std::swap(SUa, SUb);
+
+    // FIXME: Is this check really required?
+    if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+      ClusterLength = 2;
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
+                      << SUb->NodeNum << ")\n");
+
+    // Copy successor edges from SUa to SUb. Interleaving computation
+    // dependent on SUa can prevent load combining due to register reuse.
+    // Predecessor edges do not need to be copied from SUb to SUa since
+    // nearby loads should have effectively the same inputs.
+    for (const SDep &Succ : SUa->Succs) {
+      if (Succ.getSUnit() == SUb)
+        continue;
+      LLVM_DEBUG(dbgs() << "  Copy Succ SU(" << Succ.getSUnit()->NodeNum
+                        << ")\n");
+      DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+    }
+
+    ++ClusterLength;
   }
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -61,6 +61,7 @@
   };
 
   using SetVectorType = SmallSetVector<MachineInstr *, 32>;
+  mutable unsigned NumClusteredBytes = 0;
 
   static unsigned getBranchOpcode(BranchPredicate Cond);
   static BranchPredicate getBranchPredicate(unsigned Opcode);
@@ -134,6 +135,11 @@
 
   Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
 
+  void computeNumClusteredBytes(const MachineInstr &FirstLdSt,
+                                const MachineOperand *FirstDst,
+                                const MachineOperand *SecondDst,
+                                const unsigned NumLoads) const;
+
 protected:
   bool swapSourceModifiers(MachineInstr &MI,
                           MachineOperand &Src0, unsigned Src0OpName,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -428,6 +428,33 @@
   return Base1 == Base2;
 }
 
+void SIInstrInfo::computeNumClusteredBytes(const MachineInstr &FirstLdSt,
+                                           const MachineOperand *FirstDst,
+                                           const MachineOperand *SecondDst,
+                                           const unsigned NumLoads) const {
+  const MachineRegisterInfo &MRI =
+      FirstLdSt.getParent()->getParent()->getRegInfo();
+
+  const Register FirstDstReg = FirstDst->getReg();
+  const Register SecondDstReg = SecondDst->getReg();
+
+  const TargetRegisterClass *FirstDstRC =
+      Register::isVirtualRegister(FirstDstReg)
+          ? MRI.getRegClass(FirstDstReg)
+          : RI.getPhysRegClass(FirstDstReg);
+  const TargetRegisterClass *SecondDstRC =
+      Register::isVirtualRegister(SecondDstReg)
+          ? MRI.getRegClass(SecondDstReg)
+          : RI.getPhysRegClass(SecondDstReg);
+
+  const unsigned FirstNumDestBytes = RI.getRegSizeInBits(*FirstDstRC) / 8;
+  const unsigned SecondNumDestBytes = RI.getRegSizeInBits(*SecondDstRC) / 8;
+
+  NumClusteredBytes = (NumLoads == 2)
+                          ? (FirstNumDestBytes + SecondNumDestBytes)
+                          : (NumClusteredBytes + SecondNumDestBytes);
+}
+
 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
                                       unsigned NumLoads) const {
@@ -465,30 +492,11 @@
   if (!FirstDst || !SecondDst)
     return false;
 
-  // Try to limit clustering based on the total number of bytes loaded
-  // rather than the number of instructions. This is done to help reduce
-  // register pressure. The method used is somewhat inexact, though,
-  // because it assumes that all loads in the cluster will load the
-  // same number of bytes as FirstLdSt.
-
-  // The unit of this value is bytes.
-  // FIXME: This needs finer tuning.
-  unsigned LoadClusterThreshold = 16;
-
-  const MachineRegisterInfo &MRI =
-      FirstLdSt.getParent()->getParent()->getRegInfo();
-
-  const Register Reg = FirstDst->getReg();
-
-  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
-                                         ? MRI.getRegClass(Reg)
-                                         : RI.getPhysRegClass(Reg);
-
-  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
-  // of clusterNeighboringMemOps which was previosly passing cluster length
-  // less 1. LoadClusterThreshold should be tuned instead.
-  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
-      LoadClusterThreshold;
+  // Try to limit clustering based on the total number of clustered bytes
+  // loaded rather than the number of clustered instructions.
+  unsigned MaxNumClusteredBytes = 16;
+  computeNumClusteredBytes(FirstLdSt, FirstDst, SecondDst, NumLoads);
+  return (NumClusteredBytes <= MaxNumClusteredBytes);
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
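
For reference, below is a minimal standalone sketch (not part of the patch; shouldCluster, DstBytes, and the driver loop are illustrative stand-ins) of how the new NumClusteredBytes accumulator is intended to behave: the first pair of a cluster charges both destination sizes, every later pair charges only the newly appended destination, and clustering is refused once the running total exceeds the 16-byte budget (MaxNumClusteredBytes).

#include <cstdio>
#include <vector>

// Mirrors the accumulation in computeNumClusteredBytes/shouldClusterMemOps
// above, with register classes replaced by plain byte sizes. All names here
// are hypothetical.
static bool shouldCluster(unsigned &NumClusteredBytes, unsigned FirstBytes,
                          unsigned SecondBytes, unsigned NumLoads,
                          unsigned MaxNumClusteredBytes = 16) {
  // First pair of a cluster: count both destinations. Later pairs: only the
  // newly added destination.
  NumClusteredBytes = (NumLoads == 2) ? (FirstBytes + SecondBytes)
                                      : (NumClusteredBytes + SecondBytes);
  return NumClusteredBytes <= MaxNumClusteredBytes;
}

int main() {
  // Destination sizes of four adjacent 4-byte loads: 4+4, +4, +4 = 16 bytes,
  // so every pair stays within the 16-byte budget.
  std::vector<unsigned> DstBytes = {4, 4, 4, 4};
  unsigned NumClusteredBytes = 0;
  unsigned ClusterLength = 2; // cluster length for the next candidate pair
  for (size_t I = 0; I + 1 < DstBytes.size(); ++I) {
    bool Ok = shouldCluster(NumClusteredBytes, DstBytes[I], DstBytes[I + 1],
                            ClusterLength);
    std::printf("pair %zu: %u bytes -> %s\n", I, NumClusteredBytes,
                Ok ? "cluster" : "reset");
    ClusterLength = Ok ? ClusterLength + 1 : 2;
  }
  return 0;
}

Unlike the removed heuristic, which estimated (NumLoads - 1) * bytes(FirstDst) and so assumed every load in the cluster matched FirstLdSt's size, the accumulator charges each load its own destination size.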