Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -93,6 +93,9 @@
   /// Return true if this is an unscaled load/store.
   bool isUnscaledLdSt(MachineInstr *MI) const;
 
+  /// Return true if this is a load/store that can be potentially paired/merged.
+  bool isCandidateToMergeOrPair(MachineInstr *MI) const;
+
   /// Hint that pairing the given load or store is unprofitable.
   void suppressLdStPair(MachineInstr *MI) const;
 
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1342,6 +1342,32 @@
   return isUnscaledLdSt(MI->getOpcode());
 }
 
+// Is this a candidate for ld/st merging or pairing?  For example, we don't
+// touch volatiles or load/stores that have a hint to avoid pair formation.
+bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
+  // If this is a volatile load/store, don't mess with it.
+  if (MI->hasOrderedMemoryRef())
+    return false;
+
+  // Make sure this is a reg+imm (as opposed to an address reloc).
+  if (!MI->getOperand(2).isImm())
+    return false;
+
+  // Can't merge/pair if the instruction modifies the base register.
+  // e.g., ldr x0, [x0]
+  unsigned BaseReg = MI->getOperand(1).getReg();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  if (MI->modifiesRegister(BaseReg, TRI))
+    return false;
+
+  // Check if this load/store has a hint to avoid pair formation.
+  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
+  if (isLdStPairSuppressed(MI))
+    return false;
+
+  return true;
+}
+
 bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
     MachineInstr *LdSt, unsigned &BaseReg, int64_t &Offset,
     const TargetRegisterInfo *TRI) const {
@@ -1359,6 +1385,14 @@
   case AArch64::LDRQui:
   case AArch64::LDRXui:
   case AArch64::LDRWui:
+  case AArch64::LDRSWui:
+  // Unscaled instructions.
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURWi:
+  case AArch64::LDURXi:
+  case AArch64::LDURSWi:
     unsigned Width;
     return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI);
   };
@@ -1429,6 +1463,7 @@
     break;
   case AArch64::LDRWui:
   case AArch64::LDRSui:
+  case AArch64::LDRSWui:
   case AArch64::STRWui:
   case AArch64::STRSui:
     Scale = Width = 4;
@@ -1452,6 +1487,56 @@
   return true;
 }
 
+// Scale the unscaled offsets.  Returns false if the unscaled offset can't be
+// scaled.
+static bool scaleOffset(unsigned Opc, int64_t &Offset) {
+  unsigned OffsetStride = 1;
+  switch (Opc) {
+  default:
+    return false;
+  case AArch64::LDURQi:
+    OffsetStride = 16;
+    break;
+  case AArch64::LDURXi:
+  case AArch64::LDURDi:
+    OffsetStride = 8;
+    break;
+  case AArch64::LDURWi:
+  case AArch64::LDURSi:
+  case AArch64::LDURSWi:
+    OffsetStride = 4;
+    break;
+  }
+  // If the byte-offset isn't a multiple of the stride, we can't scale this
+  // offset.
+  if (Offset % OffsetStride != 0)
+    return false;
+
+  // Convert the byte-offset used by unscaled into an "element" offset used
+  // by the scaled pair load/store instructions.
+  Offset /= OffsetStride;
+  return true;
+}
+
+static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
+  if (FirstOpc == SecondOpc)
+    return true;
+  // We can also pair sign-ext and zero-ext instructions.
+  switch (FirstOpc) {
+  default:
+    return false;
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi;
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+    return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi;
+  }
+  // These instructions can't be paired based on their opcodes.
+  return false;
+}
+
+
 /// Detect opportunities for ldp/stp formation.
 ///
 /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
@@ -1461,16 +1546,36 @@
   // Only cluster up to a single pair.
   if (NumLoads > 1)
     return false;
-  if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+
+  // Can we pair these instructions based on their opcodes?
+  unsigned FirstOpc = FirstLdSt->getOpcode();
+  unsigned SecondOpc = SecondLdSt->getOpcode();
+  if (!canPairLdStOpc(FirstOpc, SecondOpc))
+     return false;
+
+  // Can't merge volatiles or load/stores that have a hint to avoid pair
+  // formation, for example.
+  if (!isCandidateToMergeOrPair(FirstLdSt) ||
+      !isCandidateToMergeOrPair(SecondLdSt))
+    return false;
+
+  // Operand 2 .
+  int64_t Offset1 = FirstLdSt->getOperand(2).getImm();
+  if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1))
+     return false;
+
+  int64_t Offset2 = SecondLdSt->getOperand(2).getImm();
+  if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2))
     return false;
-  // getMemOpBaseRegImmOfs guarantees that oper 2 isImm.
-  unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
-  // Allow 6 bits of positive range.
-  if (Ofs1 > 64)
+
+  // Pairwise instructions have a 7-bit signed offset field.
+  if (Offset1 > 63 || Offset1 < -64)
     return false;
-  // The caller should already have ordered First/SecondLdSt by offset.
-  unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
-  return Ofs1 + 1 == Ofs2;
+
+   // The caller should already have ordered First/SecondLdSt by offset.
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
+  return Offset1 + 1 == Offset2;
+
 }
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -146,10 +146,6 @@
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
-  // Is this a candidate for ld/st merging or pairing?  For example, we don't
-  // touch volatiles or load/stores that have a hint to avoid pair formation.
-  bool isCandidateToMergeOrPair(MachineInstr *MI);
-
   // Find and merge foldable ldr/str instructions.
   bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
 
@@ -1588,29 +1584,6 @@
   return false;
 }
 
-bool AArch64LoadStoreOpt::isCandidateToMergeOrPair(MachineInstr *MI) {
-  // If this is a volatile load/store, don't mess with it.
-  if (MI->hasOrderedMemoryRef())
-    return false;
-
-  // Make sure this is a reg+imm (as opposed to an address reloc).
-  if (!getLdStOffsetOp(MI).isImm())
-    return false;
-
-  // Can't merge/pair if the instruction modifies the base register.
-  // e.g., ldr x0, [x0]
-  unsigned BaseReg = getLdStBaseOp(MI).getReg();
-  if (MI->modifiesRegister(BaseReg, TRI))
-    return false;
-
-  // Check if this load/store has a hint to avoid pair formation.
-  // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
-  if (TII->isLdStPairSuppressed(MI))
-    return false;
-
-  return true;
-}
-
 // Find narrow loads that can be converted into a single wider load with
 // bitfield extract instructions.  Also merge adjacent zero stores into a wider
 // store.
@@ -1621,7 +1594,7 @@
   MachineInstr *MI = MBBI;
   MachineBasicBlock::iterator E = MI->getParent()->end();
 
-  if (!isCandidateToMergeOrPair(MI))
+  if (!TII->isCandidateToMergeOrPair(MI))
     return false;
 
   // For promotable zero stores, the stored value should be WZR.
@@ -1653,7 +1626,7 @@
   MachineInstr *MI = MBBI;
   MachineBasicBlock::iterator E = MI->getParent()->end();
 
-  if (!isCandidateToMergeOrPair(MI))
+  if (!TII->isCandidateToMergeOrPair(MI))
     return false;
 
   // Early exit if the offset is not possible to match. (6 bits of positive
Index: test/CodeGen/AArch64/arm64-ldp-cluster.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -0,0 +1,99 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+
+; Test ldr clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int:BB#0
+; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int(i32* %a) nounwind {
+  %p1 = getelementptr inbounds i32, i32* %a, i32 1
+  %tmp1 = load i32, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32, i32* %a, i32 2
+  %tmp2 = load i32, i32* %p2, align 2
+  %tmp3 = add i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}
+
+; Test ldpsw clustering
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_sext_int:BB#0
+; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_sext_int(i32* %p) nounwind {
+  %tmp = load i32, i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp1 = load i32, i32* %add.ptr, align 4
+  %sexttmp = sext i32 %tmp to i64
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  ret i64 %add
+}
+
+; Test ldur clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldur_int:BB#0
+; CHECK: Cluster loads SU(2) - SU(1)
+; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDURWi
+; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDURWi
+define i32 @ldur_int(i32* %a) nounwind {
+  %p1 = getelementptr inbounds i32, i32* %a, i32 -1
+  %tmp1 = load i32, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32, i32* %a, i32 -2
+  %tmp2 = load i32, i32* %p2, align 2
+  %tmp3 = add i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}
+
+; Test sext + zext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
+; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: SU(3):   %vreg{{[0-9]+}}<def> = LDRSWui
+; CHECK: SU(4):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
+  %tmp0 = load i64, i64* %q, align 4
+  %tmp = load i32, i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp1 = load i32, i32* %add.ptr, align 4
+  %sexttmp = sext i32 %tmp to i64
+  %sexttmp1 = zext i32 %tmp1 to i64
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  %add1 = add nsw i64 %add, %tmp0
+  ret i64 %add1
+}
+
+; Test zext + sext clustering.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
+; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: SU(3):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
+; CHECK: SU(4):   %vreg{{[0-9]+}}<def> = LDRSWui
+define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
+  %tmp0 = load i64, i64* %q, align 4
+  %tmp = load i32, i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp1 = load i32, i32* %add.ptr, align 4
+  %sexttmp = zext i32 %tmp to i64
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  %add1 = add nsw i64 %add, %tmp0
+  ret i64 %add1
+}
+
+; Verify we don't cluster volatile loads.
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: ldr_int_volatile:BB#0
+; CHECK-NOT: Cluster loads
+; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
+; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
+define i32 @ldr_int_volatile(i32* %a) nounwind {
+  %p1 = getelementptr inbounds i32, i32* %a, i32 1
+  %tmp1 = load volatile i32, i32* %p1, align 2
+  %p2 = getelementptr inbounds i32, i32* %a, i32 2
+  %tmp2 = load volatile i32, i32* %p2, align 2
+  %tmp3 = add i32 %tmp1, %tmp2
+  ret i32 %tmp3
+}