Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -69,6 +69,11 @@
 
   bool enableStackSlotScavenging(const MachineFunction &MF) const override;
 
+  /// Order the symbols in the local stack frame in an attempt to create more
+  /// load/store pair instructions.
+  void orderFrameObjects(const MachineFunction &MF,
+                         SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       unsigned StackBumpBytes) const;
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -105,6 +105,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <map>
 
 using namespace llvm;
 
@@ -1198,3 +1199,250 @@
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   return AFI->hasCalleeSaveStackFreeSpace();
 }
+
+// Order stack objects so that more of the load/store instructions that
+// reference them can be merged into load/store pair instructions.
+void AArch64FrameLowering::orderFrameObjects(
+    const MachineFunction &MF,
+    SmallVectorImpl<int> &ObjectsToAllocate) const {
+
+  // Don't waste time if there's nothing to do.
+  if (ObjectsToAllocate.empty())
+    return;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  auto &TII =
+      *static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
+
+  // Quick access to object size and whether the object was in the initial
+  // ObjectsToAllocate array. Only objects with a non-zero value in this array
+  // are of interest in this function.
+  SmallVector<unsigned, 8> ObjectsToAllocSize(
+      static_cast<unsigned>(MFI.getObjectIndexEnd()), 0);
+  for (auto &Obj : ObjectsToAllocate)
+    ObjectsToAllocSize[Obj] = MFI.getObjectSize(Obj);
+
+  // Canonicalized (sorted) pair of frame indices, used as the key for an edge
+  // in the undirected affinity graph.
+  struct IdxPair : public std::pair<int, int> {
+    IdxPair(int Idx1, int Idx2) : std::pair<int, int>(Idx1, Idx2) {
+      if (Idx1 > Idx2)
+        std::swap(first, second);
+    }
+  };
+  // Weighted undirected graph of stack slot pair affinities. Weights indicate
+  // how frequently the pair of stack slots are referenced in the same block
+  // region.
+  std::map<IdxPair, uint64_t> Affinities;
+
+  // Indexed by load/store and regclass. BlockRefs[L/S][RegClass] is a list of
+  // the load/store references to/from the specified regclass in a given block
+  // region.
+  const unsigned NumPairableRegClasses = 5;
+  SmallVector<int, 8> BlockRefs[2][NumPairableRegClasses];
+
+  auto updateBlockRefAffinitiesWeights = [&](SmallVectorImpl<int> &BlockRefs) {
+    auto N = BlockRefs.size();
+    for (unsigned I = 0; I < N; ++I) {
+      int IIdx = BlockRefs[I];
+      unsigned ISize = ObjectsToAllocSize[IIdx];
+      // Only relatively small slots are likely load/store pair candidates.
+      if (ISize == 0 || ISize > 16)
+        continue;
+      for (unsigned J = I + 1; J < N; ++J) {
+        int JIdx = BlockRefs[J];
+        if (IIdx != JIdx &&
+            // Only same-sized slots are likely load/store pair candidates.
+            ISize == ObjectsToAllocSize[JIdx])
+          Affinities[IdxPair(IIdx, JIdx)]++;
+      }
+    }
+    BlockRefs.clear();
+  };
+
+  // Update the affinity graph. For all pairs of potentially pair-able slots
+  // referenced in the same block region, increment the weight of the edge
+  // between them.
+  auto updateBlockRefAffinities = [&]() {
+    for (unsigned LoadStIdx = 0; LoadStIdx < 2; ++LoadStIdx)
+      for (unsigned RegClassIdx = 0; RegClassIdx < NumPairableRegClasses;
+           ++RegClassIdx)
+        updateBlockRefAffinitiesWeights(BlockRefs[LoadStIdx][RegClassIdx]);
+  };
+
+  // Build the weighted affinity graph by looking for pair-able accesses.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      // Treat calls as barriers that split the block into separate block
+      // regions, since we can't form load/store pairs across calls.
+      if (MI.isCall())
+        updateBlockRefAffinities();
+
+      if (!TII.isPairableLdStInst(MI))
+        continue;
+
+      const MachineOperand &Addr = MI.getOperand(1);
+      // Check to see if it's a local stack symbol.
+      if (!Addr.isFI())
+        continue;
+
+      int Index = Addr.getIndex();
+      // Check that it falls within our range and is one of the objects we
+      // are ordering.
+      if (Index < 0 || Index >= MFI.getObjectIndexEnd() ||
+          ObjectsToAllocSize[Index] == 0)
+        continue;
+
+      if (!TII.isCandidateToMergeOrPair(MI))
+        continue;
+
+      unsigned LoadStIdx = MI.mayLoad() ? 0 : 1;
+
+      unsigned Reg = MI.getOperand(0).getReg();
+      unsigned RegClassIdx;
+      if (AArch64::GPR32allRegClass.contains(Reg))
+        RegClassIdx = 0;
+      else if (AArch64::GPR64allRegClass.contains(Reg))
+        RegClassIdx = 1;
+      else if (AArch64::FPR32RegClass.contains(Reg))
+        RegClassIdx = 2;
+      else if (AArch64::FPR64RegClass.contains(Reg))
+        RegClassIdx = 3;
+      else if (AArch64::FPR128RegClass.contains(Reg))
+        RegClassIdx = 4;
+      else
+        // Skip non-pairable register classes.
+        continue;
+
+      assert(RegClassIdx < NumPairableRegClasses);
+      BlockRefs[LoadStIdx][RegClassIdx].push_back(Index);
+    }
+
+    updateBlockRefAffinities();
+  }
+
+  // Sort affinity graph edges by weight and place them in SortedAffinities.
+  using Affinity = std::pair<IdxPair, uint64_t>;
+  std::vector<Affinity> SortedAffinities;
+  for (auto &A : Affinities)
+    SortedAffinities.emplace_back(A);
+  std::sort(SortedAffinities.begin(), SortedAffinities.end(),
+            [](const Affinity &A1, const Affinity &A2) {
+              uint64_t Weight1 = A1.second;
+              uint64_t Weight2 = A2.second;
+              if (Weight1 > Weight2)
+                return true;
+              if (Weight1 != Weight2)
+                return false;
+              const IdxPair &P1 = A1.first;
+              const IdxPair &P2 = A2.first;
+              int A1Dist = P1.second - P1.first;
+              int A2Dist = P2.second - P2.first;
+              // Break ties by putting affinities whose slots are closer
+              // together in the original order first. Final tie break is on
+              // the first index of the pair to keep the order stable across
+              // different runs.
+              return std::tie(A1Dist, P1.first) < std::tie(A2Dist, P2.first);
+            });
+
+  // Accumulate "clumps" of stack slots: ordered sub-lists of stack slots laid
+  // out to honor the affinity graph edges, from highest weight to lowest.
+  typedef std::deque<int> StackSlotClump;
+  SmallVector<StackSlotClump, 8> Clumps;
+  // ClumpMap[SlotIdx] = clump index (i.e. index of the clump in the Clumps
+  // vector), or -1 if the slot is not in any clump yet.
+  SmallVector<int, 8> ClumpMap(static_cast<unsigned>(MFI.getObjectIndexEnd()),
+                               -1);
+
+  // Create a new clump consisting of Slot1 and Slot2.
+  auto addNewClump = [&](int Slot1, int Slot2) {
+    int ClumpIdx = Clumps.size();
+    Clumps.emplace_back(StackSlotClump());
+    Clumps.back().push_back(Slot1);
+    Clumps.back().push_back(Slot2);
+    ClumpMap[Slot1] = ClumpIdx;
+    ClumpMap[Slot2] = ClumpIdx;
+  };
+
+  // Try to add Slot2 to Clump1 if it can be placed next to Slot1 at either
+  // end.
+  auto tryAttachTo = [&](int Clump1, int Slot1, int Slot2) {
+    auto &Slots = Clumps[Clump1];
+    if (Slots.back() == Slot1) {
+      Slots.push_back(Slot2);
+      ClumpMap[Slot2] = Clump1;
+    } else if (Slots.front() == Slot1) {
+      Slots.push_front(Slot2);
+      ClumpMap[Slot2] = Clump1;
+    }
+  };
+
+  // Try to merge two clumps such that Slot1 and Slot2 will be adjacent in the
+  // new merged clump.
+  auto tryMerge = [&](int Clump1, int Slot1, int Clump2, int Slot2) {
+    assert(Clump1 != Clump2);
+
+    auto &Slots1 = Clumps[Clump1];
+    auto &Slots2 = Clumps[Clump2];
+
+    bool Merged = false;
+    if (Slots1.back() == Slot1) {
+      if (Slots2.front() == Slot2) {
+        Slots1.insert(Slots1.end(), Slots2.begin(), Slots2.end());
+        Merged = true;
+      } else if (Slots2.back() == Slot2) {
+        Slots1.insert(Slots1.end(), Slots2.rbegin(), Slots2.rend());
+        Merged = true;
+      }
+    } else if (Slots1.front() == Slot1) {
+      if (Slots2.front() == Slot2) {
+        Slots1.insert(Slots1.begin(), Slots2.rbegin(), Slots2.rend());
+        Merged = true;
+      } else if (Slots2.back() == Slot2) {
+        Slots1.insert(Slots1.begin(), Slots2.begin(), Slots2.end());
+        Merged = true;
+      }
+    }
+
+    if (Merged) {
+      for (int S2 : Slots2)
+        ClumpMap[S2] = Clump1;
+      Slots2.clear();
+    }
+  };
+
+  // Build clumps.
+  for (auto &A : SortedAffinities) {
+    int Idx1 = A.first.first;
+    int Idx2 = A.first.second;
+
+    int Clump1 = ClumpMap[Idx1];
+    int Clump2 = ClumpMap[Idx2];
+
+    if (Clump1 < 0 && Clump2 < 0)
+      addNewClump(Idx1, Idx2);
+    else if (Clump1 < 0)
+      tryAttachTo(Clump2, Idx2, Idx1);
+    else if (Clump2 < 0)
+      tryAttachTo(Clump1, Idx1, Idx2);
+    else if (Clump1 != Clump2)
+      tryMerge(Clump1, Idx1, Clump2, Idx2);
+  }
+
+  // Determine the final order. Non-clump slots are kept in their initial
+  // order. Members of clumps are kept in clump order, starting at the
+  // earliest point in the original list of any of their members.
+  SmallVector<int, 8> AllocationOrder;
+  AllocationOrder.reserve(ObjectsToAllocate.size());
+  for (int SIdx : ObjectsToAllocate) {
+    int CIdx = ClumpMap[SIdx];
+    if (CIdx < 0)
+      AllocationOrder.push_back(SIdx);
+    else {
+      auto &Slots = Clumps[CIdx];
+      AllocationOrder.insert(AllocationOrder.end(), Slots.begin(),
+                             Slots.end());
+      Slots.clear();
+    }
+  }
+
+  std::copy(AllocationOrder.begin(), AllocationOrder.end(),
+            ObjectsToAllocate.begin());
+}
Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -120,7 +120,7 @@
   }
 
   /// Return true if this is a load/store that can be potentially paired/merged.
-  bool isCandidateToMergeOrPair(MachineInstr &MI) const;
+  bool isCandidateToMergeOrPair(const MachineInstr &MI) const;
 
   /// Hint that pairing the given load or store is unprofitable.
   void suppressLdStPair(MachineInstr &MI) const;
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1629,22 +1629,27 @@
 // Is this a candidate for ld/st merging or pairing? For example, we don't
 // touch volatiles or load/stores that have a hint to avoid pair formation.
-bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
+bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
+  // If this is a volatile load/store, don't mess with it.
   if (MI.hasOrderedMemoryRef())
     return false;
 
   // Make sure this is a reg+imm (as opposed to an address reloc).
-  assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+  const MachineOperand &Base = MI.getOperand(1);
+  assert((Base.isReg() || Base.isFI()) &&
+         "Expected a reg or frame index operand.");
   if (!MI.getOperand(2).isImm())
     return false;
 
   // Can't merge/pair if the instruction modifies the base register.
   // e.g., ldr x0, [x0]
-  unsigned BaseReg = MI.getOperand(1).getReg();
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
-  if (MI.modifiesRegister(BaseReg, TRI))
-    return false;
+  // There is no need to check this if the base is, e.g., a frame index.
+  if (Base.isReg()) {
+    unsigned BaseReg = Base.getReg();
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    if (MI.modifiesRegister(BaseReg, TRI))
+      return false;
+  }
 
   // Check if this load/store has a hint to avoid pair formation.
   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
Index: test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
===================================================================
--- test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
+++ test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone -stack-symbol-ordering=0 < %s | FileCheck %s
 
 ; CHECK: foo
 ; CHECK: str w[[REG0:[0-9]+]], [x19, #264]
Index: test/CodeGen/AArch64/stack-layout-pairing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/stack-layout-pairing.ll
@@ -0,0 +1,376 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+declare void @bar(i32*, i32*, i32*, i32*)
+declare void @barv(<4 x i32>*, <4 x i32>*, <4 x i32>*, <4 x i32>*)
+declare void @foo(i32, i32)
+declare void @foof(float, float)
+declare void @foov(<4 x i32>, <4 x i32>)
+
+
+; Check that %x1/%x2 and %y1/%y2 get allocated to adjacent stack
+; locations and loaded using load pair instructions.
+define void @test1(i1 %c) {
+; CHECK-LABEL: test1:
+entry:
+  %x1 = alloca i32, align 4
+  %y1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %y2 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %y1, i32* %x2, i32* %y2)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v6 = load i32, i32* %y1, align 4
+  %v7 = load i32, i32* %y2, align 4
+  call void @foo(i32 %v6, i32 %v7)
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+
+; Check that %x1/%y1 and %x1/%x2 get allocated to adjacent stack locations and
+; loaded using load pair instructions.
+define void @test2(i32 %c) {
+; CHECK-LABEL: test2:
+entry:
+  %x1 = alloca i32, align 4
+  %y1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %y2 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %y1, i32* %x2, i32* %y2)
+  switch i32 %c, label %while.cond.preheader [
+    i32 1, label %while.cond.preheader.thread1
+    i32 2, label %while.cond.preheader.thread2
+  ]
+
+while.cond.preheader.thread1:
+; CHECK-LABEL: %while.cond.preheader.thread1
+; CHECK-NEXT: ldp [[REG1:w[0-9]+]], [[REG2:w[0-9]+]], [sp
+; CHECK-NEXT: bl foo
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %y1, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  br label %while.end
+
+while.cond.preheader.thread2:
+; CHECK-LABEL: %while.cond.preheader.thread2
+; CHECK-NEXT: ldp [[REG2]], [[REG1]], [sp
+; CHECK-NEXT: bl foo
+  %v6 = load i32, i32* %x1, align 4
+  %v7 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v6, i32 %v7)
+  br label %while.body.preheader
+
+while.cond.preheader:
+  %dec6 = add nsw i32 %c, -1
+  %tobool7 = icmp eq i32 %dec6, 0
+  br i1 %tobool7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  %dec8.ph = phi i32 [ 1, %while.cond.preheader.thread2 ], [ %dec6, %while.cond.preheader ]
+  br label %while.body
+
+while.body:
+  %dec8 = phi i32 [ %dec, %while.body ], [ %dec8.ph, %while.body.preheader ]
+  %v8 = load i32, i32* %x1, align 4
+  %v9 = load i32, i32* %y2, align 4
+  call void @foo(i32 %v8, i32 %v9)
+  %dec = add nsw i32 %dec8, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+; Check that a large alloca doesn't prevent the %x1/%x2 loads from getting
+; paired.
+define void @test3() {
+; CHECK-LABEL: test3:
+entry:
+  %x1 = alloca i32, align 4
+  %y = alloca i32, i32 8, align 4
+  %x2 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %y, i32* %x2, i32* %x2)
+; CHECK: ldp w1, w0, [sp
+  %vx1 = load i32, i32* %x1, align 4
+  %vx2 = load i32, i32* %x2, align 4
+  %p = getelementptr i32, i32* %y, i32 3
+  %vy = load i32, i32* %p, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foo(i32 %vy, i32 %vy)
+  ret void
+}
+
+; Check that different register classes are laid out separately to form two
+; load pairs.
+define void @test4() {
+; CHECK-LABEL: test4:
+entry:
+  %x1 = alloca i32, align 4
+  %y1 = alloca float, align 4
+  %x2 = alloca i32, align 4
+  %y2 = alloca float, align 4
+  %y1i = bitcast float* %y1 to i32*
+  %y2i = bitcast float* %y2 to i32*
+  call void @bar(i32* %x1, i32* %y1i, i32* %x2, i32* %y2i)
+; CHECK: ldp w1, w0, [sp
+  %vx1 = load i32, i32* %x1, align 4
+  %vx2 = load i32, i32* %x2, align 4
+; CHECK: ldp s{{[0-9]+}}, s{{[0-9]+}}, [sp
+  %vy1 = load float, float* %y1, align 4
+  %vy2 = load float, float* %y2, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foof(float %vy1, float %vy2)
+  ret void
+}
+
+; Check that volatile-ness is taken into account when ordering (since we
+; can't pair volatile loads).
+define void @test5() {
+; CHECK-LABEL: test5:
+entry:
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %x2, i32* %x3, i32* %x3)
+; CHECK: ldp w{{[0-9]+}}, w0, [sp
+  %vx1 = load i32, i32* %x1, align 4
+  %vx2 = load volatile i32, i32* %x2, align 4
+  %vx3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foo(i32 %vx3, i32 %vx3)
+  ret void
+}
+
+; Check that ties are broken by preserving the original stack layout
+; (i.e. lexical order).
+define void @test6() {
+; CHECK-LABEL: test6:
+entry:
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %x2, i32* %x3, i32* %x3)
+; CHECK: ldp w{{[0-9]+}}, w1, [sp, #4
+; CHECK: ldr w0, [sp, #12
+  %vx3 = load i32, i32* %x3, align 4
+  %vx2 = load i32, i32* %x2, align 4
+  %vx1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foo(i32 %vx3, i32 %vx3)
+  ret void
+}
+
+; The following few tests exercise the clump formation code in
+; AArch64FrameLowering::orderFrameObjects.
+
+; Attach clump 2-1.
+define void @test7(i1 %c) {
+; CHECK-LABEL: test7:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x1, align 4
+  %v3 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v2, i32 %v3)
+; CHECK: ldp w1, w0, [sp
+  %v2_2 = load i32, i32* %x1, align 4
+  %v3_2 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v2_2, i32 %v3_2)
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; Merge clumps 1-2.
+define void @test8(i1 %c) {
+; CHECK-LABEL: test8:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+; CHECK: ldp w1, w0, [sp
+  %v2_2 = load i32, i32* %x2, align 4
+  %v3_2 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2_2, i32 %v3_2)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w1, w0, [sp
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+
+; Merge clumps 1-rev(2).
+define void @test9(i1 %c) {
+; CHECK-LABEL: test9:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w0, w1, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w1, w0, [sp
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+; Merge clumps 2-1.
+define void @test10(i1 %c) {
+; CHECK-LABEL: test10:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w0, w1, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w0, w1, [sp
+  %v4 = load i32, i32* %x0, align 4
+  %v5 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+; Merge clumps 2-rev(1).
+define void @test11(i1 %c) {
+; CHECK-LABEL: test11:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w0, w1, [sp
+  %v4 = load i32, i32* %x0, align 4
+  %v5 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+
+; Test vector type pairing.
+define void @test12(i1 %c) {
+; CHECK-LABEL: test12:
+entry:
+  %x1 = alloca <4 x i32>, align 16
+  %y1 = alloca <4 x i32>, align 16
+  %x2 = alloca <4 x i32>, align 16
+  %y2 = alloca <4 x i32>, align 16
+  call void @barv(<4 x i32>* %x1, <4 x i32>* %y1, <4 x i32>* %x2, <4 x i32>* %y2)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp q1, q0, [sp
+  %v4 = load <4 x i32>, <4 x i32>* %x1, align 4
+  %v5 = load <4 x i32>, <4 x i32>* %x2, align 4
+  call void @foov(<4 x i32> %v4, <4 x i32> %v5)
+  br label %if.end
+
+if.else:
+; CHECK: ldp q1, q0, [sp
+  %v6 = load <4 x i32>, <4 x i32>* %y1, align 4
+  %v7 = load <4 x i32>, <4 x i32>* %y2, align 4
+  call void @foov(<4 x i32> %v6, <4 x i32> %v7)
+  br label %if.end
+
+if.end:
+  ret void
+}
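Note on the ordering heuristic: the clump-building logic in the patch (addNewClump / tryAttachTo / tryMerge) can be hard to follow inside the full pass, so below is a minimal standalone sketch of the same greedy idea in plain C++. It assumes the affinity edges have already been collected and sorted by decreasing weight; the function name buildOrder, the STL containers, and the sample edges are illustrative only and are not part of the patch.

// Standalone sketch (illustrative only, not part of the patch): greedy
// "clump" ordering of stack slots from a list of affinity edges that is
// already sorted by decreasing weight.
#include <algorithm>
#include <deque>
#include <iostream>
#include <utility>
#include <vector>

using Clump = std::deque<int>;

std::vector<int> buildOrder(int NumSlots,
                            const std::vector<std::pair<int, int>> &Edges,
                            const std::vector<int> &OriginalOrder) {
  std::vector<Clump> Clumps;
  std::vector<int> ClumpMap(NumSlots, -1); // slot -> clump index, -1 = none

  // Attach NewSlot next to Anchor if Anchor sits at either end of clump C.
  auto attach = [&](int C, int Anchor, int NewSlot) {
    Clump &S = Clumps[C];
    if (S.back() == Anchor) {
      S.push_back(NewSlot);
      ClumpMap[NewSlot] = C;
    } else if (S.front() == Anchor) {
      S.push_front(NewSlot);
      ClumpMap[NewSlot] = C;
    } // Anchor is interior: this edge cannot be honored, so drop it.
  };

  for (const auto &E : Edges) {
    int A = E.first, B = E.second;
    int CA = ClumpMap[A], CB = ClumpMap[B];
    if (CA < 0 && CB < 0) { // Start a new clump {A, B}.
      ClumpMap[A] = ClumpMap[B] = static_cast<int>(Clumps.size());
      Clumps.push_back({A, B});
    } else if (CA < 0) {
      attach(CB, B, A);
    } else if (CB < 0) {
      attach(CA, A, B);
    } else if (CA != CB) {
      // Merge the two clumps only if A and B sit at usable ends; orient the
      // clumps so that A ends up adjacent to B in the merged clump.
      Clump &SA = Clumps[CA];
      Clump &SB = Clumps[CB];
      if (SA.front() != A && SA.back() != A)
        continue;
      if (SB.front() != B && SB.back() != B)
        continue;
      if (SA.front() == A)
        std::reverse(SA.begin(), SA.end());
      if (SB.back() == B)
        std::reverse(SB.begin(), SB.end());
      SA.insert(SA.end(), SB.begin(), SB.end());
      for (int S : SB)
        ClumpMap[S] = CA;
      SB.clear();
    }
  }

  // Emit each clump at the first original position of any of its members;
  // slots that never joined a clump keep their original relative order.
  std::vector<int> Order;
  for (int S : OriginalOrder) {
    int C = ClumpMap[S];
    if (C < 0)
      Order.push_back(S);
    else if (!Clumps[C].empty()) {
      Order.insert(Order.end(), Clumps[C].begin(), Clumps[C].end());
      Clumps[C].clear();
    }
  }
  return Order;
}

int main() {
  // Hot pairs (0,2) and (1,3) over slots 0..3, mirroring test1 above where
  // %x1/%x2 and %y1/%y2 are the hot pairs.
  for (int S : buildOrder(4, {{0, 2}, {1, 3}}, {0, 1, 2, 3}))
    std::cout << S << ' ';
  std::cout << '\n'; // prints: 0 2 1 3
}

With those sample edges the sketch prints 0 2 1 3: each hot pair becomes adjacent, so a single ldp/stp can cover it, while unrelated slots keep their original relative order.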