Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1199,6 +1199,15 @@
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
 
+  /// \brief Get maximum # of store operations to be glued together
+  ///
+  /// This function returns the maximum number of store operations permitted
+  /// to glue together during lowering of llvm.memcpy. The value is set by
+  //  the target at the performance threshold for such a replacement.
+  virtual unsigned getMaxGluedStoresPerMemcpy() const {
+    return MaxGluedStoresPerMemcpy;
+  }
+
   /// Get maximum # of load operations permitted for memcmp
   ///
   /// This function returns the maximum number of load operations permitted
@@ -2509,6 +2518,14 @@
   /// constant size.
   unsigned MaxStoresPerMemcpy;
 
+
+  /// \brief Specify max number of store instructions to glue in inlined memcpy.
+  ///
+  /// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number
+  /// of store instructions to keep together. This helps in pairing and 
+  //  vectorization later on.
+  unsigned MaxGluedStoresPerMemcpy = 0;
+
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -89,6 +89,14 @@
 
 #define DEBUG_TYPE "selectiondag"
 
+static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
+       cl::Hidden, cl::init(true),
+       cl::desc("Gang up loads and stores generated by inlining of memcpy"));
+
+static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
+       cl::desc("Number limit for gluing ld/st of memcpy."),
+       cl::Hidden, cl::init(0));
+
 static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
   LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
 }
@@ -5218,6 +5226,31 @@
   return MF.getFunction().optForSize();
 }
 
+static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
+                          SmallVector<SDValue, 32> &OutChains, unsigned From,
+                          unsigned To, SmallVector<SDValue, 16> &OutLoadChains,
+                          SmallVector<SDValue, 16> &OutStoreChains) {
+  assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
+  assert(OutStoreChains.size() && "Missing stores in memcpy inlining");
+  SmallVector<SDValue, 16> GluedLoadChains;
+  for (unsigned i = From; i < To; ++i) {
+    OutChains.push_back(OutLoadChains[i]);
+    GluedLoadChains.push_back(OutLoadChains[i]);
+  }
+
+  // Chain for all loads.
+  SDValue LoadToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                                  GluedLoadChains);
+
+  for (unsigned i = From; i < To; ++i) {
+    StoreSDNode *ST = dyn_cast<StoreSDNode>(OutStoreChains[i]);
+    SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
+                                  ST->getBasePtr(), ST->getMemoryVT(),
+                                  ST->getMemOperand());
+    OutChains.push_back(NewStore);
+  }
+}
+
 static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
                                        SDValue Chain, SDValue Dst, SDValue Src,
                                        uint64_t Size, unsigned Align,
@@ -5282,7 +5315,9 @@
 
   MachineMemOperand::Flags MMOFlags =
       isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
-  SmallVector<SDValue, 8> OutChains;
+  SmallVector<SDValue, 16> OutLoadChains;
+  SmallVector<SDValue, 16> OutStoreChains;
+  SmallVector<SDValue, 32> OutChains;
   unsigned NumMemOps = MemOps.size();
   uint64_t SrcOff = 0, DstOff = 0;
   for (unsigned i = 0; i != NumMemOps; ++i) {
@@ -5316,11 +5351,13 @@
         SubSlice.Length = VTSize;
       }
       Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
-      if (Value.getNode())
+      if (Value.getNode()) {
         Store = DAG.getStore(Chain, dl, Value,
                              DAG.getMemBasePlusOffset(Dst, DstOff, dl),
                              DstPtrInfo.getWithOffset(DstOff), Align,
                              MMOFlags);
+        OutChains.push_back(Store);
+      }
     }
 
     if (!Store.getNode()) {
@@ -5342,17 +5379,61 @@
                              DAG.getMemBasePlusOffset(Src, SrcOff, dl),
                              SrcPtrInfo.getWithOffset(SrcOff), VT,
                              MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
-      OutChains.push_back(Value.getValue(1));
+      OutLoadChains.push_back(Value.getValue(1));
+
       Store = DAG.getTruncStore(
           Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
           DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
+      OutStoreChains.push_back(Store);
     }
-    OutChains.push_back(Store);
     SrcOff += VTSize;
     DstOff += VTSize;
     Size -= VTSize;
   }
 
+  unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
+                                TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
+  unsigned NumLdStInMemcpy = OutStoreChains.size();
+
+  if (NumLdStInMemcpy) {
+    // It may be that memcpy might be converted to memset if it's memcpy
+    // of constants. In such a case, we won't have loads and stores, but
+    // just stores. In the absence of loads, there is nothing to gang up.
+    if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
+      // If target does not care, just leave as it.
+      for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
+        OutChains.push_back(OutLoadChains[i]);
+        OutChains.push_back(OutStoreChains[i]);
+      }
+    } else {
+      // Ld/St less than/equal limit set by target.
+      if (NumLdStInMemcpy <= GluedLdStLimit) {
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+                                        NumLdStInMemcpy, OutLoadChains,
+                                        OutStoreChains);
+      } else {
+        unsigned NumberLdChain =  NumLdStInMemcpy / GluedLdStLimit;
+        unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
+        unsigned GlueIter = 0;
+
+        for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
+          unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
+          unsigned IndexTo   = NumLdStInMemcpy - GlueIter;
+
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
+                                       OutLoadChains, OutStoreChains);
+          GlueIter += GluedLdStLimit;
+        }
+
+        // Residual ld/st.
+        if (RemainingLdStInMemcpy) {
+          chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
+                                        RemainingLdStInMemcpy, OutLoadChains,
+                                        OutStoreChains);
+        }
+      }
+    }
+  }
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
 }
 
Index: llvm/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -535,6 +535,7 @@
   // Perform these initializations only once.
   MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
       MaxLoadsPerMemcmp = 8;
+  MaxGluedStoresPerMemcpy = 0;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
       MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -580,6 +580,8 @@
   setTargetDAGCombine(ISD::GlobalAddress);
 
   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+  MaxGluedStoresPerMemcpy = 4;
+
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
 
Index: llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
+++ llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
@@ -8,9 +8,9 @@
 
 ; CHECK:      adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}}
 ; CHECK: add  x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}}
+; CHECK-NEXT: ldr  [[VAL2:x[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: ldr  [[VAL:w[0-9]+]], [x[[ADDR]], #8]
 ; CHECK-NEXT: str  [[VAL]], [x0, #8]
-; CHECK-NEXT: ldr  [[VAL2:x[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: str  [[VAL2]], [x0]
 
 define void @foo(i8* %a) {
Index: llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
+++ llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
@@ -29,10 +29,10 @@
 define void @t1(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t1:
-; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
-; CHECK: stur [[DEST]], [x0, #15]
 ; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
-; CHECK: str [[DEST]], [x0]
+; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
+; CHECK: stur [[DEST:q[0-9]+]], [x0, #15]
+; CHECK: str [[DEST:q[0-9]+]], [x0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i1 false)
   ret void
 }
@@ -52,9 +52,9 @@
 define void @t3(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t3:
+; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
 ; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16]
 ; CHECK: str [[REG4]], [x0, #16]
-; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
 ; CHECK: str [[DEST]], [x0]
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i1 false)
   ret void
Index: llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -130,12 +130,9 @@
 
 ; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
 
-; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]]
+; CHECK: ldp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[SRC]]]
 ; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
-; CHECK: str [[BLOCK]], [x[[DST]]]
-
-; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16]
-; CHECK: str [[BLOCK]], [x[[DST]], #16]
+; CHECK: stp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[DST]]]
   ret void
 ; CHECK: ret
 }
Index: llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
+++ llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -34,8 +34,8 @@
 define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
 ; CHECK: Precompute_Patch_Values
 ; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
-; CHECK-NEXT: str [[VAL]], [sp, #232]
 ; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
+; CHECK-NEXT: str [[VAL]], [sp, #232]
 ; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
 entry:
   %Control_Points = alloca [16 x [3 x double]], align 8
Index: llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll
===================================================================
--- llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll
+++ llvm/test/CodeGen/AArch64/mergestores_noimplicitfloat.ll
@@ -4,16 +4,14 @@
 target triple = "arm64-apple-ios10.0.0"
 
 ; PR33475 - Expect 64-bit operations as 128-operations are not legal
+; However, we can generate a paired 64-bit loads and stores, without using
+; floating point registers.
 
 ; CHECK-LABEL: pr33475
-; CHECK-DAG: ldr [[R0:x[0-9]+]], [x1]
-; CHECK-DAG: str [[R0]], [x0]
-; CHECK-DAG: ldr [[R1:x[0-9]+]], [x1, #8]
-; CHECK-DAG: str [[R1]], [x0, #8]
-; CHECK-DAG: ldr [[R2:x[0-9]+]], [x1, #16]
-; CHECK-DAG: str [[R2]], [x0, #16]
-; CHECK-DAG: ldr [[R3:x[0-9]+]], [x1, #24]
-; CHECK-DAG: str [[R3]], [x0, #24]
+; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1, #16]
+; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1]
+; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0, #16]
+; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0]
 
 define void @pr33475(i8* %p0, i8* %p1) noimplicitfloat {
     call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p0, i8* align 4 %p1, i64 32, i1 false)
Index: llvm/test/CodeGen/ARM/memcpy-inline.ll
===================================================================
--- llvm/test/CodeGen/ARM/memcpy-inline.ll
+++ llvm/test/CodeGen/ARM/memcpy-inline.ll
@@ -44,15 +44,14 @@
 define void @t2(i8* nocapture %C) nounwind {
 entry:
 ; CHECK-LABEL: t2:
-; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]!
+; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
+; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
 ; CHECK: movs [[INC:r[0-9]+]], #32
-; CHECK: add.w   r3, r0, #16
 ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
 ; CHECK: movw [[REG2:r[0-9]+]], #16716
 ; CHECK: movt [[REG2:r[0-9]+]], #72
 ; CHECK: str [[REG2]], [r0]
-; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
-; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3]
 ; CHECK-T1-LABEL: t2:
 ; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i1 false)