Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1342,11 +1342,17 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, const TargetRegisterInfo *TRI) const { - // Handle only loads/stores with base register followed by immediate offset. - if (LdSt->getNumOperands() != 3) - return false; - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + // Handle only loads/stores with base register followed by immediate offset + // or pairs. + if (LdSt->getNumOperands() == 3) { + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + } else if (LdSt->getNumOperands() == 4) { + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isReg() || !LdSt->getOperand(3).isImm()) + return false; + } else { return false; + } // Offset is calculated as the immediate operand multiplied by the scaling factor. // Unscaled instructions have scaling factor set to 1. 
@@ -1392,10 +1398,30 @@ Width = 1; Scale = 1; break; + case AArch64::LDPQi: + case AArch64::STPQi: + // A pair's imm7 offset is scaled by the size of a single register of the pair; Width covers both registers. + Scale = 16; + Width = 32; + break; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case AArch64::STPDi: + Scale = 8; + Width = 16; + break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; break; + case AArch64::LDPWi: + case AArch64::STPWi: + case AArch64::LDPSi: + case AArch64::STPSi: + Scale = 4; + Width = 8; + break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: @@ -1422,8 +1448,13 @@ break; } - BaseReg = LdSt->getOperand(1).getReg(); - Offset = LdSt->getOperand(2).getImm() * Scale; + if (LdSt->getNumOperands() == 3) { + BaseReg = LdSt->getOperand(1).getReg(); + Offset = LdSt->getOperand(2).getImm() * Scale; + } else if (LdSt->getNumOperands() == 4) { + BaseReg = LdSt->getOperand(2).getReg(); + Offset = LdSt->getOperand(3).getImm() * Scale; + } return true; } Index: llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll =================================================================== --- llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll +++ llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll @@ -67,17 +67,19 @@ ; CHECK-LABEL: f2: ; CHECK-EVEN: fmadd [[x:d[0-9]*[02468]]] -; CHECK-EVEN: fmul [[y:d[0-9]*[13579]]] ; CHECK-ODD: fmadd [[x:d[0-9]*[13579]]] +; CHECK-A57: fmadd [[x]] +; CHECK-EVEN: fmul [[y:d[0-9]*[13579]]] ; CHECK-ODD: fmul [[y:d[0-9]*[02468]]] -; CHECK: fmadd [[x]] ; CHECK: fmadd [[y]] +; CHECK-A53: fmadd [[x]] +; CHECK-A53: fmadd [[y]] ; CHECK: fmsub [[x]] -; CHECK: fmadd [[y]] +; CHECK-A57: fmadd [[y]] +; CHECK-A53-DAG: str [[y]] ; CHECK: fmadd [[x]] ; CHECK-A57: stp [[x]], [[y]] ; CHECK-A53-DAG: str [[x]] -; CHECK-A53-DAG: str [[y]] define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 { entry: @@ -162,17 +164,19 @@ ; CHECK-LABEL: f4: ; CHECK-EVEN: fmadd [[x:s[0-9]*[02468]]] -; CHECK-EVEN: fmul [[y:s[0-9]*[13579]]] ; CHECK-ODD: fmadd [[x:s[0-9]*[13579]]] +; CHECK-A57: fmadd [[x]] +; CHECK-EVEN: fmul [[y:s[0-9]*[13579]]] ; 
CHECK-ODD: fmul [[y:s[0-9]*[02468]]] -; CHECK: fmadd [[x]] ; CHECK: fmadd [[y]] +; CHECK-A53: fmadd [[x]] +; CHECK-A53: fmadd [[y]] ; CHECK: fmsub [[x]] -; CHECK: fmadd [[y]] +; CHECK-A57: fmadd [[y]] +; CHECK-A53-DAG: str [[y]] ; CHECK: fmadd [[x]] ; CHECK-A57: stp [[x]], [[y]] ; CHECK-A53-DAG: str [[x]] -; CHECK-A53-DAG: str [[y]] define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 { entry: Index: llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll +++ llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll @@ -8,8 +8,8 @@ define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp { entry: ; CHECK: test -; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32] ; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]] %retval = alloca <16 x float>, align 16 Index: llvm/test/CodeGen/AArch64/fastcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/fastcc.ll +++ llvm/test/CodeGen/AArch64/fastcc.ll @@ -20,7 +20,7 @@ ; CHECK-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack8 -; CHECK-TAIL: sub sp, sp, #16 +; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) @@ -70,7 +70,7 @@ ; CHECK-TAIL: bl func_stack8 -; CHECK-TAIL: sub sp, sp, #16 +; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) @@ -113,7 +113,7 @@ ; CHECK-NOT: sub sp, sp, ; CHECK-TAIL: bl func_stack8 -; CHECK-TAIL: sub sp, sp, #16 +; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! 
call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) Index: llvm/test/CodeGen/AArch64/ldst-pairing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/ldst-pairing.ll @@ -0,0 +1,102 @@ +; RUN: llc < %s -march=arm64 -mtriple=aarch64-none-linux-gnu | FileCheck %s + +; CHECK: stp +; CHECK: stp +define void @st1(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %a, <4 x float> * %a1 + store <4 x float> %b, <4 x float> * %b1 + store <4 x float> %c, <4 x float> * %c1 + store <4 x float> %d, <4 x float> * %d1 + + ret void +} + +; CHECK: stp +; CHECK: stp +define void @st2(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* %base, i64 %index) { +entry: + %a0 = getelementptr inbounds float, float* %base, i64 %index + %b0 = getelementptr float, float* %a0, i64 4 + %c0 = getelementptr float, float* %a0, i64 8 + %d0 = getelementptr float, float* %a0, i64 12 + + %a1 = bitcast float* %a0 to <4 x float>* + %b1 = bitcast float* %b0 to <4 x float>* + %c1 = bitcast float* %c0 to <4 x float>* + %d1 = bitcast float* %d0 to <4 x float>* + + store <4 x float> %c, <4 x float> * %c1, align 4 + store <4 x float> %a, <4 x float> * %a1, align 4 + + ; This fadd forces the compiler to pair %c and %e after fadd, and leave the + ; stores %a and %b separated by a stp. The dependence analysis needs then to + ; prove that it is safe to move %b past the stp to be paired with %a. 
+ %e = fadd fast <4 x float> %d, %a + + store <4 x float> %e, <4 x float> * %d1, align 4 + store <4 x float> %b, <4 x float> * %b1, align 4 + + ret void +} + +; CHECK: stp +; CHECK: stp +define void @st3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %a, <4 x float> * %a1 + store <4 x float> %d, <4 x float> * %d1 + store <4 x float> %c, <4 x float> * %c1 + store <4 x float> %b, <4 x float> * %b1 + + ret void +} + +; CHECK: stp +; CHECK: stp +define void @st4(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %d, <4 x float> * %d1 + store <4 x float> %b, <4 x float> * %b1 + + %e = fadd fast <4 x float> %c, %a + + store <4 x float> %e, <4 x float> * %c1 + store <4 x float> %a, <4 x float> * %a1 + + ret void +} + +; FIXME: st5 should contain two stp. The current pairing algorithm is greedy and +; is pairing %c and %b, leaving the two other stores %a and %d not pairable. 
+ +; CHECK: stp +define void @st5(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> * %base, i64 %index) { +entry: + %a1 = getelementptr inbounds <4 x float>, <4 x float>* %base, i64 %index + %b1 = getelementptr <4 x float>, <4 x float>* %a1, i64 1 + %c1 = getelementptr <4 x float>, <4 x float>* %a1, i64 2 + %d1 = getelementptr <4 x float>, <4 x float>* %a1, i64 3 + + store <4 x float> %c, <4 x float> * %c1 + store <4 x float> %a, <4 x float> * %a1 + store <4 x float> %b, <4 x float> * %b1 + store <4 x float> %d, <4 x float> * %d1 + + ret void +}