Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -61,6 +61,14 @@
 /// ARMAllocLoadStoreOpt - Post- register allocation pass the combine
 /// load / store instructions to form ldm / stm instructions.
 
+static cl::opt<unsigned> ScanLimit("arm-load-store-scan-limit",
+                                   cl::init(20), cl::Hidden);
+
+static cl::opt<bool> AlwaysCollapseToLoadStoreDouble(
+    "arm-load-store-use-ldrd-strd", cl::Hidden,
+    cl::desc("Always try to collapse load/store pairs into ldrd/strd if "
+             "available on the target architecture"), cl::init(true));
+
 namespace {
   struct ARMLoadStoreOpt : public MachineFunctionPass {
     static char ID;
@@ -139,8 +147,19 @@
                                    MachineBasicBlock::iterator MBBI,
                                    bool &Advance,
                                    MachineBasicBlock::iterator &I);
+    // Merge the two instructions indicated into a single pair-wise
+    // instruction. If MergeForward is true, erase the first instruction and
+    // fold its operation into the second. If false, the reverse. Return the
+    // instruction following the first instruction (which may change during
+    // processing).
+    MachineBasicBlock::iterator
+    mergePairedInsns(MachineBasicBlock::iterator I,
+                     MachineBasicBlock::iterator Paired, bool MergeForward);
     bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool LoadStoreToDoubleOpti(MachineBasicBlock &MBB);
     bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+    MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
+                                                 bool &MergeForward, unsigned Limit);
   };
   char ARMLoadStoreOpt::ID = 0;
 }
@@ -371,6 +390,75 @@
     return 4;
   }
 }
+static unsigned getMatchingPairOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no pairwise equivalent!");
+  case ARM::t2LDRi12:
+    return ARM::t2LDRDi8;
+  case ARM::t2STRi12:
+    return ARM::t2STRDi8;
+  }
+}
+
+MachineBasicBlock::iterator
+ARMLoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+                                  MachineBasicBlock::iterator Paired,
+                                  bool MergeForward) {
+  MachineBasicBlock::iterator NextI = I;
+  ++NextI;
+  // If NextI is the second of the two instructions to be merged, we need
+  // to skip one further. Either way, merging will invalidate the iterator,
+  // and we don't need to scan the new instruction, as it's a pairwise
+  // instruction, which we're not considering for further action anyway.
+  if (NextI == Paired)
+    ++NextI;
+
+  unsigned Opc = I->getOpcode();
+  int OffsetStride = 4;
+
+  unsigned NewOpc = getMatchingPairOpcode(Opc);
+  // Insert our new paired instruction after whichever of the paired
+  // instructions MergeForward indicates.
+  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+  // MergeForward also determines which instruction we copy the base register
+  // operand from, so that the operand flags stay compatible with the input
+  // code.
+  MachineOperand &BaseRegOp =
+      MergeForward ? Paired->getOperand(1) : I->getOperand(1);
+
+  // Which register is Rt and which is Rt2 depends on the offset order.
+  MachineInstr *RtMI, *Rt2MI;
+  if (I->getOperand(2).getImm() ==
+      Paired->getOperand(2).getImm() + OffsetStride) {
+    RtMI = Paired;
+    Rt2MI = I;
+  } else {
+    RtMI = I;
+    Rt2MI = Paired;
+  }
+  int OffsetImm = RtMI->getOperand(2).getImm();
+
+  // Construct the new instruction.
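+  // The paired opcode takes (Rt, Rt2, base register, byte offset); the
+  // predicate operands are appended below by AddDefaultPred.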
+  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
+                                    I->getDebugLoc(), TII->get(NewOpc))
+                                .addOperand(RtMI->getOperand(0))
+                                .addOperand(Rt2MI->getOperand(0))
+                                .addOperand(BaseRegOp)
+                                .addImm(OffsetImm);
+  AddDefaultPred(MIB);
+
+  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(Paired->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions.
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  return NextI;
+}
 
 /// Update future uses of the base register with the offset introduced
 /// due to writeback. This function only works on Thumb1.
@@ -1590,6 +1678,198 @@
   return false;
 }
 
+/// trackRegDefsUses - Remember what registers the specified instruction uses
+/// and modifies.
+static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
+                             BitVector &UsedRegs,
+                             const TargetRegisterInfo *TRI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isRegMask())
+      ModifiedRegs.setBitsNotInMask(MO.getRegMask());
+
+    if (!MO.isReg() || MO.getReg() == 0)
+      continue;
+    unsigned Reg = MO.getReg();
+    if (MO.isDef()) {
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        ModifiedRegs.set(*AI);
+    } else {
+      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        UsedRegs.set(*AI);
+    }
+  }
+}
+
+/// findMatchingInsn - Scan the instructions looking for a load/store that can
+/// be combined with the current instruction into a load/store pair.
+MachineBasicBlock::iterator
+ARMLoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+                                  bool &MergeForward, unsigned Limit) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr *FirstMI = I;
+  ++MBBI;
+
+  int Opc = FirstMI->getOpcode();
+  bool MayLoad = FirstMI->mayLoad();
+  unsigned Reg = FirstMI->getOperand(0).getReg();
+  unsigned BaseReg = FirstMI->getOperand(1).getReg();
+  int Offset = FirstMI->getOperand(2).getImm();
+
+  // Early exit if the first instruction modifies the base register.
+  if (FirstMI->modifiesRegister(BaseReg, TRI))
+    return E;
+
+  int OffsetStride = 4;
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  BitVector ModifiedRegs, UsedRegs;
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+    MachineInstr *MI = MBBI;
+
+    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+    // optimization by changing how far we scan.
+    if (MI->isDebugValue())
+      continue;
+
+    // Now that we know this is a real instruction, count it.
+    ++Count;
+
+    bool CanMergeOpc = Opc == MI->getOpcode();
+
+    if (CanMergeOpc && MI->getOperand(2).isImm()) {
+      // If we've found another instruction with the same opcode, check to see
+      // if the base and offset are compatible with our starting instruction.
+      unsigned MIBaseReg = MI->getOperand(1).getReg();
+      int MIOffset = MI->getOperand(2).getImm();
+      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
+                                   (Offset + OffsetStride == MIOffset))) {
+        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
+        // t2LDRDi8/t2STRDi8 can only encode a multiple-of-4 offset up to
+        // 1020, which is narrower than the 12-bit immediate of
+        // t2LDRi12/t2STRi12, so skip candidates the paired form cannot
+        // represent and keep looking.
+        if ((MinOffset & 3) != 0 || MinOffset > 1020) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          continue;
+        }
+        // If this is a volatile load/store that otherwise matched, stop
+        // looking as something is going on that we don't have enough
+        // information to safely transform.
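+        // hasOrderedMemoryRef() is conservative: it also covers atomic
+        // accesses and instructions with no memory operand information.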
+        if (MI->hasOrderedMemoryRef())
+          return E;
+
+        // If the destination register of the loads is the same register, bail
+        // and keep looking. A load-pair instruction with both destination
+        // registers the same is UNPREDICTABLE and will result in an exception.
+        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          continue;
+        }
+
+        // Cortex-M3 erratum 602117: LDRD with base in list may result in an
+        // incorrect base register when interrupted or faulted.
+        if (STI->isCortexM3() && MI->modifiesRegister(BaseReg, TRI))
+          return E;
+
+        // If the Rt of the second instruction was not modified or used between
+        // the two instructions, we can combine the second into the first.
+        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
+            !UsedRegs[MI->getOperand(0).getReg()]) {
+          MergeForward = false;
+          return MBBI;
+        }
+
+        // Likewise, if the Rt of the first instruction is not modified or used
+        // between the two instructions, we can combine the first into the
+        // second.
+        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
+            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
+          MergeForward = true;
+          return MBBI;
+        }
+        // Unable to combine these instructions due to interference in between.
+        // Keep looking.
+      }
+    }
+
+    // If the instruction wasn't a matching load or store, but does (or can)
+    // modify memory, stop searching, as we don't have alias analysis or
+    // anything like that to tell us whether the access is tromping on the
+    // locations we care about. The big one we want to catch is calls.
+    //
+    // FIXME: Theoretically, we can do better than that for SP and FP based
+    // references since we can effectively know where those are touching. It's
+    // unclear if it's worth the extra code, though. Most paired instructions
+    // will be sequential, perhaps with a few intervening non-memory related
+    // instructions.
+    if (MI->mayStore() || MI->isCall())
+      return E;
+    // Likewise, if we're matching a store instruction, we don't want to
+    // move across a load, as it may be reading the same location.
+    if (FirstMI->mayStore() && MI->mayLoad())
+      return E;
+
+    // Update modified / uses register lists.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg])
+      return E;
+  }
+  return E;
+}
+
+// FIXME: Currently, this only supports collapsing ldr/str pairs into
+// ldrd/strd on v7-M based cores. The v7-A and v7-R architectures also support
+// ldrd/strd, but with a few extra restrictions; for example, the first
+// destination register of ldrd must be an even-numbered register and the
+// second must be the next consecutive register. We should update the code at
+// some point to make it possible to generate ldrd/strd for these
+// architectures as well.
+bool ARMLoadStoreOpt::LoadStoreToDoubleOpti(MachineBasicBlock &MBB) {
+  if (!isThumb2 || !STI->hasV7Ops() || !STI->isMClass())
+    return false;
+
+  bool Modified = false;
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    case ARM::t2LDRi12:
+    case ARM::t2STRi12: {
+      // If this is a volatile load/store, don't mess with it.
+      if (MI->hasOrderedMemoryRef()) {
+        ++MBBI;
+        break;
+      }
+      // Make sure this is a reg+imm (as opposed to an address reloc).
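+      // (Operand 2 of t2LDRi12/t2STRi12 holds the immediate offset; a
+      // relocated address shows up here as a non-immediate operand.)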
+      if (!MI->getOperand(2).isImm()) {
+        ++MBBI;
+        break;
+      }
+      // Look ahead up to ScanLimit instructions for a pairable instruction.
+      bool MergeForward = false;
+      MachineBasicBlock::iterator Paired =
+          findMatchingInsn(MBBI, MergeForward, ScanLimit);
+      if (Paired != E) {
+        // Merge the loads into a pair. Keeping the iterator straight is a
+        // pain, so we let the merge routine tell us what the next instruction
+        // is after it's done mucking about.
+        MBBI = mergePairedInsns(MBBI, Paired, MergeForward);
+        Modified = true;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+    }
+  }
+  return Modified;
+}
+
 /// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
 /// ops of the same base and incrementing offset into LDM / STM ops.
 bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
@@ -1828,9 +2108,13 @@
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
        ++MFI) {
     MachineBasicBlock &MBB = *MFI;
-    Modified |= LoadStoreMultipleOpti(MBB);
-    if (STI->hasV5TOps())
-      Modified |= MergeReturnIntoLDM(MBB);
+    bool BlockModified = false;
+    if (AlwaysCollapseToLoadStoreDouble)
+      BlockModified |= LoadStoreToDoubleOpti(MBB);
+    if (!BlockModified) {
+      BlockModified |= LoadStoreMultipleOpti(MBB);
+      if (STI->hasV5TOps())
+        BlockModified |= MergeReturnIntoLDM(MBB);
+    }
+    Modified |= BlockModified;
   }
 
   delete RS;
Index: test/CodeGen/ARM/ldrd.ll
===================================================================
--- test/CodeGen/ARM/ldrd.ll
+++ test/CodeGen/ARM/ldrd.ll
@@ -18,6 +18,7 @@
 
 ; M3-LABEL: t:
 ; M3-NOT: ldrd
+; M3: umull
 
   %0 = load i64*, i64** @b, align 4
   %1 = load i64, i64* %0, align 4
Index: test/CodeGen/Thumb2/aapcs.ll
===================================================================
--- test/CodeGen/Thumb2/aapcs.ll
+++ test/CodeGen/Thumb2/aapcs.ll
@@ -33,8 +33,7 @@
 
 define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) {
 ; CHECK-LABEL: double_on_stack:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
 ; HARD: vldr d0, [sp]
 ; CHECK-NEXT: bx lr
   ret double %i
@@ -42,8 +41,7 @@
 
 define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) {
 ; CHECK-LABEL: double_not_split:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
 ; HARD: vldr d0, [sp]
 ; CHECK-NEXT: bx lr
   ret double %i
Index: test/CodeGen/Thumb2/thumb2-memcpy-ldrd-strd.ll
===================================================================
--- test/CodeGen/Thumb2/thumb2-memcpy-ldrd-strd.ll
+++ test/CodeGen/Thumb2/thumb2-memcpy-ldrd-strd.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 -mcpu=cortex-m7 | FileCheck %s
+@d = external global [64 x i32]
+@s = external global [64 x i32]
+
+; Function Attrs: nounwind
+define void @t1() #0 {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldrd
+; CHECK-NEXT: ldrd
+; CHECK-NEXT: strd
+; CHECK-NEXT: strd
+; CHECK-NEXT: ldrb
+; CHECK-NEXT: strb
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind }