Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -183,6 +183,18 @@
   return isSmallTypeLdMerge(MI->getOpcode());
 }
 
+// Helper for findMatchingInsn: returns true if MI is an instruction that
+// touches memory (a load, store, call, or an instruction with unmodeled side
+// effects) and the scan limit is 1, i.e. MI is the instruction immediately
+// following the load/store we are trying to pair.
+static bool isRightNextLdStMerge(MachineInstr *MI, unsigned Limit) {
+  if (!MI->mayStore() && !MI->mayLoad() && !MI->isCall() &&
+      !MI->hasUnmodeledSideEffects())
+    return false;
+
+  return Limit == 1;
+}
+
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr *MI) {
   switch (MI->getOpcode()) {
@@ -756,6 +768,20 @@
   unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
   int Offset = getLdStOffsetOp(FirstMI).getImm();
 
+  // If the first instruction is a volatile load/store, only consider the
+  // instruction immediately following it for merging. For this we simply
+  // set Limit to 1, so that only the very next instruction is examined.
+  // e.g.,
+  //   ldr x0, [x2]      ; volatile
+  //   ldr x1, [x2, #8]  ; the instruction right after the volatile load
+  //  becomes
+  //   ldp x0, x1, [x2]  ; the pair has no memory reference information and
+  //                     ; is therefore also treated as a volatile access
+  // The same holds when only the second instruction, or both of the
+  // ldr/str instructions, are volatile.
+  if (FirstMI->hasOrderedMemoryRef())
+    Limit = 1;
+
   // Early exit if the first instruction modifies the base register.
   // e.g., ldr x0, [x0]
   if (FirstMI->modifiesRegister(BaseReg, TRI))
@@ -820,7 +846,13 @@
     // If this is a volatile load/store that otherwise matched, stop looking
     // as something is going on that we don't have enough information to
     // safely transform. Similarly, stop if we see a hint to avoid pairs.
-    if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
+    // However, if Limit is 1 we are only trying to merge the load/store that
+    // immediately follows, and swapping two adjacent accesses makes no
+    // semantic difference, so volatile loads/stores can still be merged.
+    // The resulting pair instruction has no memory reference information and
+    // is therefore itself treated as a volatile access.
+    if ((!isRightNextLdStMerge(MI, Limit) && MI->hasOrderedMemoryRef()) ||
+        TII->isLdStPairSuppressed(MI))
       return E;
     // If the resultant immediate offset of merging these instructions
     // is out of range for a pairwise instruction, bail and keep looking.
@@ -1135,9 +1167,6 @@
                                    MachineBasicBlock::iterator &MBBI) {
   MachineInstr *MI = MBBI;
   MachineBasicBlock::iterator E = MI->getParent()->end();
-  // If this is a volatile load/store, don't mess with it.
-  if (MI->hasOrderedMemoryRef())
-    return false;
 
   // Make sure this is a reg+imm (as opposed to an address reloc).
   if (!getLdStOffsetOp(MI).isImm())
Index: test/CodeGen/AArch64/arm64-ldp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ldp.ll
+++ test/CodeGen/AArch64/arm64-ldp.ll
@@ -356,3 +356,22 @@
   ret i64 %add
 }
 
+; Test the load/store optimizer---combine adjacent volatile ldr
+; CHECK-LABEL: volatile_ldp_long
+; CHECK: ldr
+; CHECK: ldp
+; CHECK: ldr
+define i64 @volatile_ldp_long(i64* %p) nounwind {
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 0
+  %tmp = load volatile i64, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64, i64* %p, i64 2
+  %tmp1 = load volatile i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 3
+  %tmp2 = load volatile i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %p, i64 1
+  %tmp3 = load volatile i64, i64* %add.ptr3, align 8
+  %add = add nsw i64 %tmp1, %tmp
+  %add2 = add nsw i64 %tmp2, %add
+  %add3 = add nsw i64 %tmp3, %add2
+  ret i64 %add3
+}
Index: test/CodeGen/AArch64/arm64-platform-reg.ll
===================================================================
--- test/CodeGen/AArch64/arm64-platform-reg.ll
+++ test/CodeGen/AArch64/arm64-platform-reg.ll
@@ -14,14 +14,14 @@
   %val = load volatile [30 x i64], [30 x i64]* @var
   store volatile [30 x i64] %val, [30 x i64]* @var
 
-; CHECK: ldr x18
-; CHECK: str x18
+; CHECK: ldp x17, x18
+; CHECK: stp x17, x18
 
-; CHECK-RESERVE-X18-NOT: ldr fp
-; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18-NOT: ldp x17, fp
+; CHECK-RESERVE-X18-NOT: ldp x17, x18
 ; CHECK-RESERVE-X18: Spill
-; CHECK-RESERVE-X18-NOT: ldr fp
-; CHECK-RESERVE-X18-NOT: ldr x18
+; CHECK-RESERVE-X18-NOT: ldp x17, fp
+; CHECK-RESERVE-X18-NOT: ldp x17, x18
 ; CHECK-RESERVE-X18: ret
   ret void
 }
Index: test/CodeGen/AArch64/arm64-stp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-stp.ll
+++ test/CodeGen/AArch64/arm64-stp.ll
@@ -129,3 +129,20 @@
   store i32 %b, i32* %add.ptr, align 4
   ret i32 %tmp2
 }
+
+; Test the load/store optimizer---combine adjacent volatile str
+; CHECK-LABEL: volatile_stp_long
+; CHECK: str x0, [x4]
+; CHECK: stp x1, x2, [x4, #16]
+; CHECK: str x3, [x4, #8]
+define void @volatile_stp_long(i64 %a, i64 %b, i64 %c, i64 %d, i64* nocapture %p) nounwind {
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 0
+  store volatile i64 %a, i64* %add.ptr, align 8
+  %add.ptr1 = getelementptr inbounds i64, i64* %p, i64 2
+  store volatile i64 %b, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 3
+  store volatile i64 %c, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %p, i64 1
+  store volatile i64 %d, i64* %add.ptr3, align 8
+  ret void
+}
Index: test/CodeGen/AArch64/arm64-volatile.ll
===================================================================
--- test/CodeGen/AArch64/arm64-volatile.ll
+++ test/CodeGen/AArch64/arm64-volatile.ll
@@ -2,26 +2,45 @@
 define i64 @normal_load(i64* nocapture %bar) nounwind readonly {
 ; CHECK: normal_load
 ; CHECK: ldp
+; CHECK: ldp
+; CHECK-NEXT: add
+; CHECK-NEXT: add
 ; CHECK-NEXT: add
 ; CHECK-NEXT: ret
   %add.ptr = getelementptr inbounds i64, i64* %bar, i64 1
   %tmp = load i64, i64* %add.ptr, align 8
-  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 2
+  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 5
   %tmp1 = load i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %bar, i64 2
+  %tmp2 = load i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %bar, i64 4
+  %tmp3 = load i64, i64* %add.ptr3, align 8
   %add = add nsw i64 %tmp1, %tmp
-  ret i64 %add
+  %add2 = add nsw i64 %tmp3, %tmp2
+  %add3 = add nsw i64 %add, %add2
+  ret i64 %add3
 }
 
 define i64 @volatile_load(i64* nocapture %bar) nounwind {
 ; CHECK: volatile_load
 ; CHECK: ldr
 ; CHECK-NEXT: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: ldr
+; CHECK-NEXT: add
+; CHECK-NEXT: add
 ; CHECK-NEXT: add
 ; CHECK-NEXT: ret
   %add.ptr = getelementptr inbounds i64, i64* %bar, i64 1
   %tmp = load volatile i64, i64* %add.ptr, align 8
-  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 2
+  %add.ptr1 = getelementptr inbounds i64, i64* %bar, i64 5
   %tmp1 = load volatile i64, i64* %add.ptr1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %bar, i64 2
+  %tmp2 = load volatile i64, i64* %add.ptr2, align 8
+  %add.ptr3 = getelementptr inbounds i64, i64* %bar, i64 4
+  %tmp3 = load volatile i64, i64* %add.ptr3, align 8
   %add = add nsw i64 %tmp1, %tmp
-  ret i64 %add
+  %add2 = add nsw i64 %tmp3, %tmp2
+  %add3 = add nsw i64 %add, %add2
+  ret i64 %add3
 }