Index: include/llvm/Target/TargetInstrInfo.h
===================================================================
--- include/llvm/Target/TargetInstrInfo.h
+++ include/llvm/Target/TargetInstrInfo.h
@@ -997,9 +997,11 @@
   virtual bool enableClusterLoads() const { return false; }
 
-  virtual bool shouldClusterLoads(MachineInstr *FirstLdSt,
-                                  MachineInstr *SecondLdSt,
-                                  unsigned NumLoads) const {
+  virtual bool enableClusterStores() const { return false; }
+
+  virtual bool shouldClusterMemOps(MachineInstr *FirstLdSt,
+                                   MachineInstr *SecondLdSt,
+                                   unsigned NumLoads) const {
     return false;
   }
 
Index: lib/CodeGen/MachineScheduler.cpp
===================================================================
--- lib/CodeGen/MachineScheduler.cpp
+++ lib/CodeGen/MachineScheduler.cpp
@@ -71,8 +71,9 @@
 static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
   cl::desc("Enable cyclic critical path analysis."), cl::init(true));
 
-static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
-  cl::desc("Enable load clustering."), cl::init(true));
+static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
+                                        cl::desc("Enable memop clustering."),
+                                        cl::init(true));
 
 // Experimental heuristics
 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
@@ -1351,64 +1352,80 @@
 }
 
 //===----------------------------------------------------------------------===//
-// LoadClusterMutation - DAG post-processing to cluster loads.
+// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
 //===----------------------------------------------------------------------===//
 
 namespace {
 /// \brief Post-process the DAG to create cluster edges between neighboring
-/// loads.
-class LoadClusterMutation : public ScheduleDAGMutation {
-  struct LoadInfo {
+/// loads or between neighboring stores.
+class BaseMemOpClusterMutation : public ScheduleDAGMutation {
+  struct MemOpInfo {
     SUnit *SU;
     unsigned BaseReg;
     int64_t Offset;
-    LoadInfo(SUnit *su, unsigned reg, int64_t ofs)
-      : SU(su), BaseReg(reg), Offset(ofs) {}
+    MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
+        : SU(su), BaseReg(reg), Offset(ofs) {}
 
-    bool operator<(const LoadInfo &RHS) const {
+    bool operator<(const MemOpInfo &RHS) const {
       return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);
     }
   };
 
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  bool IsLoad;
+
 public:
-  LoadClusterMutation(const TargetInstrInfo *tii,
-                      const TargetRegisterInfo *tri)
-    : TII(tii), TRI(tri) {}
+  BaseMemOpClusterMutation(const TargetInstrInfo *tii,
+                           const TargetRegisterInfo *tri, bool IsLoad)
+      : TII(tii), TRI(tri), IsLoad(IsLoad) {}
 
   void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
 protected:
-  void clusterNeighboringLoads(ArrayRef<SUnit*> Loads, ScheduleDAGMI *DAG);
+  void clusterNeighboringMemOps(ArrayRef<SUnit*> MemOps, ScheduleDAGMI *DAG);
+};
+
+class StoreClusterMutation : public BaseMemOpClusterMutation {
+public:
+  StoreClusterMutation(const TargetInstrInfo *tii,
+                       const TargetRegisterInfo *tri)
+    : BaseMemOpClusterMutation(tii, tri, false) {}
+};
+
+class LoadClusterMutation : public BaseMemOpClusterMutation {
+public:
+  LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
+    : BaseMemOpClusterMutation(tii, tri, true) {}
 };
 } // anonymous
 
-void LoadClusterMutation::clusterNeighboringLoads(ArrayRef<SUnit*> Loads,
-                                                  ScheduleDAGMI *DAG) {
-  SmallVector<LoadInfo, 32> LoadRecords;
-  for (unsigned Idx = 0, End = Loads.size(); Idx != End; ++Idx) {
-    SUnit *SU = Loads[Idx];
+void BaseMemOpClusterMutation::clusterNeighboringMemOps(
+    ArrayRef<SUnit*> MemOps, ScheduleDAGMI *DAG) {
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) {
+    SUnit *SU = MemOps[Idx];
     unsigned BaseReg;
     int64_t Offset;
     if (TII->getMemOpBaseRegImmOfs(SU->getInstr(), BaseReg, Offset, TRI))
-      LoadRecords.push_back(LoadInfo(SU, BaseReg, Offset));
+      MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset));
   }
-  if (LoadRecords.size() < 2)
+  if (MemOpRecords.size() < 2)
     return;
-  std::sort(LoadRecords.begin(), LoadRecords.end());
+
+  std::sort(MemOpRecords.begin(), MemOpRecords.end());
   unsigned ClusterLength = 1;
-  for (unsigned Idx = 0, End = LoadRecords.size(); Idx < (End - 1); ++Idx) {
-    if (LoadRecords[Idx].BaseReg != LoadRecords[Idx+1].BaseReg) {
+  for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
+    if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {
       ClusterLength = 1;
       continue;
     }
-    SUnit *SUa = LoadRecords[Idx].SU;
-    SUnit *SUb = LoadRecords[Idx+1].SU;
-    if (TII->shouldClusterLoads(SUa->getInstr(), SUb->getInstr(), ClusterLength)
+    SUnit *SUa = MemOpRecords[Idx].SU;
+    SUnit *SUb = MemOpRecords[Idx+1].SU;
+    if (TII->shouldClusterMemOps(SUa->getInstr(), SUb->getInstr(), ClusterLength)
         && DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
-
-      DEBUG(dbgs() << "Cluster loads SU(" << SUa->NodeNum << ") - SU("
+      DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
             << SUb->NodeNum << ")\n");
       // Copy successor edges from SUa to SUb. Interleaving computation
       // dependent on SUa can prevent load combining due to register reuse.
@@ -1429,17 +1446,20 @@
 }
 
 /// \brief Callback from DAG postProcessing to create cluster edges for loads.
-void LoadClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
   // Map DAG NodeNum to store chain ID.
   DenseMap<unsigned, unsigned> StoreChainIDs;
-  // Map each store chain to a set of dependent loads.
+  // Map each store chain to a set of dependent MemOps.
   SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
   for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
     SUnit *SU = &DAG->SUnits[Idx];
-    if (!SU->getInstr()->mayLoad())
+    if ((IsLoad && !SU->getInstr()->mayLoad()) ||
+        (!IsLoad && !SU->getInstr()->mayStore()))
       continue;
+
     unsigned ChainPredID = DAG->SUnits.size();
     for (SUnit::const_pred_iterator PI = SU->Preds.begin(),
            PE = SU->Preds.end(); PI != PE; ++PI) {
@@ -1449,7 +1469,7 @@
       }
     }
     // Check if this chain-like pred has been seen
-    // before. ChainPredID==MaxNodeID for loads at the top of the schedule.
+    // before. ChainPredID==MaxNodeID at the top of the schedule.
     unsigned NumChains = StoreChainDependents.size();
     std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
       StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
@@ -1457,9 +1477,10 @@
       StoreChainDependents.resize(NumChains + 1);
     StoreChainDependents[Result.first->second].push_back(SU);
   }
+
   // Iterate over the store chains.
   for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
-    clusterNeighboringLoads(StoreChainDependents[Idx], DAG);
+    clusterNeighboringMemOps(StoreChainDependents[Idx], DAG);
 }
 
 //===----------------------------------------------------------------------===//
@@ -3054,8 +3075,12 @@
   // data and pass it to later mutations. Have a single mutation that gathers
   // the interesting nodes in one pass.
   DAG->addMutation(make_unique<CopyConstrain>(DAG->TII, DAG->TRI));
-  if (EnableLoadCluster && DAG->TII->enableClusterLoads())
-    DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+  if (EnableMemOpCluster) {
+    if (DAG->TII->enableClusterLoads())
+      DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+    if (DAG->TII->enableClusterStores())
+      DAG->addMutation(make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI));
+  }
   if (EnableMacroFusion)
     DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI));
   return DAG;
Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -109,7 +109,9 @@
   bool enableClusterLoads() const override { return true; }
 
-  bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
+  bool enableClusterStores() const override { return true; }
+
+  bool shouldClusterMemOps(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
                           unsigned NumLoads) const override;
 
   bool shouldScheduleAdjacent(MachineInstr *First,
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1431,6 +1431,11 @@
   case AArch64::LDRWui:
   case AArch64::LDRSWui:
   // Unscaled instructions.
+ case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURXi: + case AArch64::STURWi: case AArch64::LDURSi: case AArch64::LDURDi: case AArch64::LDURQi: @@ -1540,15 +1545,20 @@ default: return false; case AArch64::LDURQi: + case AArch64::STURQi: OffsetStride = 16; break; case AArch64::LDURXi: case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: OffsetStride = 8; break; case AArch64::LDURWi: case AArch64::LDURSi: case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: OffsetStride = 4; break; } @@ -1584,9 +1594,9 @@ /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. -bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { // Only cluster up to a single pair. if (NumLoads > 1) return false; Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -94,9 +94,9 @@ int64_t &Offset, const TargetRegisterInfo *TRI) const final; - bool shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const final; + bool shouldClusterMemOps(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const final; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -292,9 +292,9 @@ return false; } -bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { +bool SIInstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { const MachineOperand *FirstDst = nullptr; const MachineOperand *SecondDst = nullptr; Index: test/CodeGen/AArch64/aarch64-stp-cluster.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -0,0 +1,149 @@ +; REQUIRES: asserts +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -aarch64-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s + +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: stp_i64_scale:BB#0 +; CHECK:Cluster ld/st SU(4) - SU(3) +; CHECK:Cluster ld/st SU(2) - SU(5) +; CHECK:SU(4): STRXui %vreg1, %vreg0, 1 +; CHECK:SU(3): STRXui %vreg1, %vreg0, 2 +; CHECK:SU(2): STRXui %vreg1, %vreg0, 3 +; CHECK:SU(5): STRXui %vreg1, %vreg0, 4 +define i64 @stp_i64_scale(i64* nocapture %P, i64 %v) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %P, i64 3 + store i64 %v, i64* %arrayidx + %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2 + store i64 %v, i64* %arrayidx1 + %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1 + store i64 %v, i64* %arrayidx2 + %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4 + store i64 %v, i64* %arrayidx3 + ret i64 %v +} + +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: stp_i32_scale:BB#0 +; CHECK:Cluster ld/st SU(4) - SU(3) +; CHECK:Cluster ld/st SU(2) - SU(5) +; CHECK:SU(4): STRWui %vreg1, %vreg0, 1 +; CHECK:SU(3): STRWui %vreg1, %vreg0, 2 +; CHECK:SU(2): STRWui 
%vreg1, %vreg0, 3 +; CHECK:SU(5): STRWui %vreg1, %vreg0, 4 +define i32 @stp_i32_scale(i32* nocapture %P, i32 %v) { +entry: + %arrayidx = getelementptr inbounds i32, i32* %P, i32 3 + store i32 %v, i32* %arrayidx + %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 2 + store i32 %v, i32* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 1 + store i32 %v, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 4 + store i32 %v, i32* %arrayidx3 + ret i32 %v +} + +; CHECK:********** MI Scheduling ********** +; CHECK-LABEL:stp_i64_unscale:BB#0 entry +; CHECK:Cluster ld/st SU(5) - SU(2) +; CHECK:Cluster ld/st SU(4) - SU(3) +; CHECK:SU(5): STURXi %vreg1, %vreg0, -32 +; CHECK:SU(2): STURXi %vreg1, %vreg0, -24 +; CHECK:SU(4): STURXi %vreg1, %vreg0, -16 +; CHECK:SU(3): STURXi %vreg1, %vreg0, -8 +define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 { +entry: + %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3 + store i64 %v, i64* %arrayidx + %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1 + store i64 %v, i64* %arrayidx1 + %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2 + store i64 %v, i64* %arrayidx2 + %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4 + store i64 %v, i64* %arrayidx3 + ret void +} + +; CHECK:********** MI Scheduling ********** +; CHECK-LABEL:stp_i32_unscale:BB#0 entry +; CHECK:Cluster ld/st SU(5) - SU(2) +; CHECK:Cluster ld/st SU(4) - SU(3) +; CHECK:SU(5): STURWi %vreg1, %vreg0, -16 +; CHECK:SU(2): STURWi %vreg1, %vreg0, -12 +; CHECK:SU(4): STURWi %vreg1, %vreg0, -8 +; CHECK:SU(3): STURWi %vreg1, %vreg0, -4 +define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 { +entry: + %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3 + store i32 %v, i32* %arrayidx + %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1 + store i32 %v, i32* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2 + store i32 %v, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4 + store i32 %v, i32* %arrayidx3 + ret void +} + +; CHECK:********** MI Scheduling ********** +; CHECK-LABEL:stp_double:BB#0 +; CHECK:Cluster ld/st SU(3) - SU(4) +; CHECK:Cluster ld/st SU(2) - SU(5) +; CHECK:SU(3): STRDui %vreg1, %vreg0, 1 +; CHECK:SU(4): STRDui %vreg1, %vreg0, 2 +; CHECK:SU(2): STRDui %vreg1, %vreg0, 3 +; CHECK:SU(5): STRDui %vreg1, %vreg0, 4 +define void @stp_double(double* nocapture %P, double %v) { +entry: + %arrayidx = getelementptr inbounds double, double* %P, i64 3 + store double %v, double* %arrayidx + %arrayidx1 = getelementptr inbounds double, double* %P, i64 1 + store double %v, double* %arrayidx1 + %arrayidx2 = getelementptr inbounds double, double* %P, i64 2 + store double %v, double* %arrayidx2 + %arrayidx3 = getelementptr inbounds double, double* %P, i64 4 + store double %v, double* %arrayidx3 + ret void +} + +; CHECK:********** MI Scheduling ********** +; CHECK-LABEL:stp_float:BB#0 +; CHECK:Cluster ld/st SU(3) - SU(4) +; CHECK:Cluster ld/st SU(2) - SU(5) +; CHECK:SU(3): STRSui %vreg1, %vreg0, 1 +; CHECK:SU(4): STRSui %vreg1, %vreg0, 2 +; CHECK:SU(2): STRSui %vreg1, %vreg0, 3 +; CHECK:SU(5): STRSui %vreg1, %vreg0, 4 +define void @stp_float(float* nocapture %P, float %v) { +entry: + %arrayidx = getelementptr inbounds float, float* %P, i64 3 + store float %v, float* %arrayidx + %arrayidx1 = getelementptr inbounds float, float* %P, i64 1 + store float %v, float* %arrayidx1 + %arrayidx2 = getelementptr inbounds float, float* %P, i64 2 + store float %v, float* %arrayidx2 + %arrayidx3 = 
getelementptr inbounds float, float* %P, i64 4 + store float %v, float* %arrayidx3 + ret void +} + +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: stp_volatile:BB#0 +; CHECK-NOT: Cluster ld/st +; CHECK:SU(2): STRXui %vreg1, %vreg0, 3; mem:Volatile +; CHECK:SU(3): STRXui %vreg1, %vreg0, 2; mem:Volatile +; CHECK:SU(4): STRXui %vreg1, %vreg0, 1; mem:Volatile +; CHECK:SU(5): STRXui %vreg1, %vreg0, 4; mem:Volatile +define i64 @stp_volatile(i64* nocapture %P, i64 %v) { +entry: + %arrayidx = getelementptr inbounds i64, i64* %P, i64 3 + store volatile i64 %v, i64* %arrayidx + %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2 + store volatile i64 %v, i64* %arrayidx1 + %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1 + store volatile i64 %v, i64* %arrayidx2 + %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4 + store volatile i64 %v, i64* %arrayidx3 + ret i64 %v +} + Index: test/CodeGen/AArch64/arm64-ldp-cluster.ll =================================================================== --- test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -4,7 +4,7 @@ ; Test ldr clustering. ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldr_int:BB#0 -; CHECK: Cluster loads SU(1) - SU(2) +; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui define i32 @ldr_int(i32* %a) nounwind { @@ -19,7 +19,7 @@ ; Test ldpsw clustering ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldp_sext_int:BB#0 -; CHECK: Cluster loads SU(1) - SU(2) +; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRSWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRSWui define i64 @ldp_sext_int(i32* %p) nounwind { @@ -35,7 +35,7 @@ ; Test ldur clustering. ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldur_int:BB#0 -; CHECK: Cluster loads SU(2) - SU(1) +; CHECK: Cluster ld/st SU(2) - SU(1) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDURWi ; CHECK: SU(2): %vreg{{[0-9]+}} = LDURWi define i32 @ldur_int(i32* %a) nounwind { @@ -50,7 +50,7 @@ ; Test sext + zext clustering. ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldp_half_sext_zext_int:BB#0 -; CHECK: Cluster loads SU(3) - SU(4) +; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRSWui ; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind { @@ -68,7 +68,7 @@ ; Test zext + sext clustering. ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldp_half_zext_sext_int:BB#0 -; CHECK: Cluster loads SU(3) - SU(4) +; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui ; CHECK: SU(4): %vreg{{[0-9]+}} = LDRSWui define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind { @@ -86,7 +86,7 @@ ; Verify we don't cluster volatile loads. 
; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldr_int_volatile:BB#0 -; CHECK-NOT: Cluster loads +; CHECK-NOT: Cluster ld/st ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui define i32 @ldr_int_volatile(i32* %a) nounwind { Index: test/CodeGen/AArch64/arm64-stp.ll =================================================================== --- test/CodeGen/AArch64/arm64-stp.ll +++ test/CodeGen/AArch64/arm64-stp.ll @@ -100,9 +100,9 @@ ; Read of %b to compute %tmp2 shouldn't prevent formation of stp ; CHECK-LABEL: stp_int_rar_hazard -; CHECK: stp w0, w1, [x2] ; CHECK: ldr [[REG:w[0-9]+]], [x2, #8] -; CHECK: add w0, [[REG]], w1 +; CHECK: add w8, [[REG]], w1 +; CHECK: stp w0, w1, [x2] ; CHECK: ret define i32 @stp_int_rar_hazard(i32 %a, i32 %b, i32* nocapture %p) nounwind { store i32 %a, i32* %p, align 4 Index: test/CodeGen/AArch64/global-merge-group-by-use.ll =================================================================== --- test/CodeGen/AArch64/global-merge-group-by-use.ll +++ test/CodeGen/AArch64/global-merge-group-by-use.ll @@ -64,8 +64,8 @@ define void @f4(i32 %a1, i32 %a2, i32 %a3) #0 { ; CHECK-NEXT: adrp x8, [[SET3]]@PAGE ; CHECK-NEXT: add x8, x8, [[SET3]]@PAGEOFF -; CHECK-NEXT: stp w0, w1, [x8, #4] -; CHECK-NEXT: str w2, [x8] +; CHECK-NEXT: stp w2, w0, [x8] +; CHECK-NEXT: str w1, [x8, #8] ; CHECK-NEXT: ret store i32 %a1, i32* @m4, align 4 store i32 %a2, i32* @n4, align 4