Index: lib/Target/AArch64/AArch64FrameLowering.h
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.h
+++ lib/Target/AArch64/AArch64FrameLowering.h
@@ -69,6 +69,11 @@
 
   bool enableStackSlotScavenging(const MachineFunction &MF) const override;
 
+  /// Order the symbols in the local stack frame in an attempt to create more
+  /// load/store pair instructions.
+  void orderFrameObjects(const MachineFunction &MF,
+                         SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       unsigned StackBumpBytes) const;
Index: lib/Target/AArch64/AArch64FrameLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64FrameLowering.cpp
+++ lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -105,6 +105,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <map>
 
 using namespace llvm;
 
@@ -1198,3 +1199,250 @@
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   return AFI->hasCalleeSaveStackFreeSpace();
 }
+
+// Order stack objects so that more of the load/store instructions that
+// reference them can be merged into load/store pair instructions.
+void AArch64FrameLowering::orderFrameObjects(
+    const MachineFunction &MF,
+    SmallVectorImpl<int> &ObjectsToAllocate) const {
+
+  // Don't waste time if there's nothing to do.
+  if (ObjectsToAllocate.empty())
+    return;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  auto &TII =
+      *static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
+
+  // Quick access to object size and whether the object was in the initial
+  // ObjectsToAllocate array. Only objects with a non-zero value in this array
+  // are of interest in this function.
+  SmallVector<unsigned, 8> ObjectsToAllocSize(
+      static_cast<unsigned>(MFI.getObjectIndexEnd()), 0);
+  for (auto &Obj : ObjectsToAllocate)
+    ObjectsToAllocSize[Obj] = MFI.getObjectSize(Obj);
+
+  // Canonicalized (sorted) pair of frame indices, used as the key for an edge
+  // in the undirected affinity graph.
+  struct IdxPair : public std::pair<int, int> {
+    IdxPair(int Idx1, int Idx2) : std::pair<int, int>(Idx1, Idx2) {
+      if (Idx1 > Idx2)
+        std::swap(first, second);
+    }
+  };
+  // Weighted undirected graph of stack slot pair affinities. Weights indicate
+  // how frequently the pair of stack slots are referenced in the same block
+  // region.
+  std::map<IdxPair, uint64_t> Affinities;
+
+  // Indexed by load/store and regclass. BlockRefs[L/S][RegClass] is a list of
+  // the load/store references to/from the specified regclass in a given block
+  // region.
+  const unsigned NumPairableRegClasses = 5;
+  SmallVector<int, 8> BlockRefs[2][NumPairableRegClasses];
+
+  auto updateBlockRefAffinitiesWeights = [&](SmallVectorImpl<int> &BlockRefs) {
+    auto N = BlockRefs.size();
+    for (unsigned I = 0; I < N; ++I) {
+      int IIdx = BlockRefs[I];
+      unsigned ISize = ObjectsToAllocSize[IIdx];
+      // Only relatively small slots are likely load/store pair candidates.
+      if (ISize == 0 || ISize > 16)
+        continue;
+      for (unsigned J = I + 1; J < N; ++J) {
+        int JIdx = BlockRefs[J];
+        if (IIdx != JIdx &&
+            // Only same-sized slots are likely load/store pair candidates.
+            ISize == ObjectsToAllocSize[JIdx])
+          Affinities[IdxPair(IIdx, JIdx)]++;
+      }
+    }
+    BlockRefs.clear();
+  };
+
+  // Update the affinity graph. For all pairs of potentially pair-able slots
+  // referenced in the same block region, increment the weight of the edge
+  // between them.
+  auto updateBlockRefAffinities = [&]() {
+    for (unsigned LoadStIdx = 0; LoadStIdx < 2; ++LoadStIdx)
+      for (unsigned RegClassIdx = 0; RegClassIdx < NumPairableRegClasses;
+           ++RegClassIdx)
+        updateBlockRefAffinitiesWeights(BlockRefs[LoadStIdx][RegClassIdx]);
+  };
+
+  // Build the weighted affinity graph by looking for pair-able accesses.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      // Treat calls as barriers that split the block into separate block
+      // regions, since we can't form load/store pairs across calls.
+      if (MI.isCall())
+        updateBlockRefAffinities();
+
+      if (!TII.isPairableLdStInst(MI))
+        continue;
+
+      const MachineOperand &Addr = MI.getOperand(1);
+      // Check to see if it's a local stack symbol.
+      if (!Addr.isFI())
+        continue;
+
+      int Index = Addr.getIndex();
+      // Check that it falls within our range and is one of the objects we
+      // are ordering.
+      if (Index < 0 || Index >= MFI.getObjectIndexEnd() ||
+          ObjectsToAllocSize[Index] == 0)
+        continue;
+
+      if (!TII.isCandidateToMergeOrPair(MI))
+        continue;
+
+      unsigned LoadStIdx = MI.mayLoad() ? 0 : 1;
+
+      unsigned Reg = MI.getOperand(0).getReg();
+      unsigned RegClassIdx;
+      if (AArch64::GPR32allRegClass.contains(Reg))
+        RegClassIdx = 0;
+      else if (AArch64::GPR64allRegClass.contains(Reg))
+        RegClassIdx = 1;
+      else if (AArch64::FPR32RegClass.contains(Reg))
+        RegClassIdx = 2;
+      else if (AArch64::FPR64RegClass.contains(Reg))
+        RegClassIdx = 3;
+      else if (AArch64::FPR128RegClass.contains(Reg))
+        RegClassIdx = 4;
+      else
+        // Skip non-pairable register classes.
+        continue;
+
+      assert(RegClassIdx < NumPairableRegClasses);
+      BlockRefs[LoadStIdx][RegClassIdx].push_back(Index);
+    }
+
+    updateBlockRefAffinities();
+  }
+
+  // Sort affinity graph edges by weight and place them in SortedAffinities.
+  using Affinity = std::pair<IdxPair, uint64_t>;
+  std::vector<Affinity> SortedAffinities;
+  for (auto &A : Affinities)
+    SortedAffinities.emplace_back(A);
+  std::sort(SortedAffinities.begin(), SortedAffinities.end(),
+            [](const Affinity &A1, const Affinity &A2) {
+              uint64_t Weight1 = A1.second;
+              uint64_t Weight2 = A2.second;
+              if (Weight1 > Weight2)
+                return true;
+              if (Weight1 != Weight2)
+                return false;
+              const IdxPair &P1 = A1.first;
+              const IdxPair &P2 = A2.first;
+              int A1Dist = P1.second - P1.first;
+              int A2Dist = P2.second - P2.first;
+              // Break ties by putting affinities whose slots are closer
+              // together in the original order first. Final tie break is on
+              // the first index of the pair to keep the order stable across
+              // different runs.
+              return std::tie(A1Dist, P1.first) < std::tie(A2Dist, P2.first);
+            });
+
+  // Accumulate "clumps" of stack slots: ordered sub-lists of stack slots laid
+  // out to honor the affinity graph edges, from highest weight to lowest.
+  typedef std::deque<int> StackSlotClump;
+  SmallVector<StackSlotClump, 8> Clumps;
+  // ClumpMap[SlotIdx] = clump index (i.e. index of the clump in the Clumps
+  // vector), or -1 if the slot is not in any clump yet.
+  SmallVector<int, 8> ClumpMap(static_cast<unsigned>(MFI.getObjectIndexEnd()),
+                               -1);
+
+  // Create a new clump consisting of Slot1 and Slot2.
+  auto addNewClump = [&](int Slot1, int Slot2) {
+    int ClumpIdx = Clumps.size();
+    Clumps.emplace_back(StackSlotClump());
+    Clumps.back().push_back(Slot1);
+    Clumps.back().push_back(Slot2);
+    ClumpMap[Slot1] = ClumpIdx;
+    ClumpMap[Slot2] = ClumpIdx;
+  };
+
+  // Try to add Slot2 to Clump1 if it can be placed next to Slot1 at either
+  // end.
+  auto tryAttachTo = [&](int Clump1, int Slot1, int Slot2) {
+    auto &Slots = Clumps[Clump1];
+    if (Slots.back() == Slot1) {
+      Slots.push_back(Slot2);
+      ClumpMap[Slot2] = Clump1;
+    } else if (Slots.front() == Slot1) {
+      Slots.push_front(Slot2);
+      ClumpMap[Slot2] = Clump1;
+    }
+  };
+
+  // Try to merge two clumps such that Slot1 and Slot2 will be adjacent in the
+  // new merged clump.
+  auto tryMerge = [&](int Clump1, int Slot1, int Clump2, int Slot2) {
+    assert(Clump1 != Clump2);
+
+    auto &Slots1 = Clumps[Clump1];
+    auto &Slots2 = Clumps[Clump2];
+
+    bool Merged = false;
+    if (Slots1.back() == Slot1) {
+      if (Slots2.front() == Slot2) {
+        Slots1.insert(Slots1.end(), Slots2.begin(), Slots2.end());
+        Merged = true;
+      } else if (Slots2.back() == Slot2) {
+        Slots1.insert(Slots1.end(), Slots2.rbegin(), Slots2.rend());
+        Merged = true;
+      }
+    } else if (Slots1.front() == Slot1) {
+      if (Slots2.front() == Slot2) {
+        Slots1.insert(Slots1.begin(), Slots2.rbegin(), Slots2.rend());
+        Merged = true;
+      } else if (Slots2.back() == Slot2) {
+        Slots1.insert(Slots1.begin(), Slots2.begin(), Slots2.end());
+        Merged = true;
+      }
+    }
+
+    if (Merged) {
+      for (int S2 : Slots2)
+        ClumpMap[S2] = Clump1;
+      Slots2.clear();
+    }
+  };
+
+  // Build clumps.
+  for (auto &A : SortedAffinities) {
+    int Idx1 = A.first.first;
+    int Idx2 = A.first.second;
+
+    int Clump1 = ClumpMap[Idx1];
+    int Clump2 = ClumpMap[Idx2];
+
+    if (Clump1 < 0 && Clump2 < 0)
+      addNewClump(Idx1, Idx2);
+    else if (Clump1 < 0)
+      tryAttachTo(Clump2, Idx2, Idx1);
+    else if (Clump2 < 0)
+      tryAttachTo(Clump1, Idx1, Idx2);
+    else if (Clump1 != Clump2)
+      tryMerge(Clump1, Idx1, Clump2, Idx2);
+  }
+
+  // Determine the final order. Non-clump slots are kept in their initial
+  // order. Members of clumps are kept in clump order, starting at the
+  // earliest point in the original list of any of their members.
+  SmallVector<int, 8> AllocationOrder;
+  AllocationOrder.reserve(ObjectsToAllocate.size());
+  for (int SIdx : ObjectsToAllocate) {
+    int CIdx = ClumpMap[SIdx];
+    if (CIdx < 0)
+      AllocationOrder.push_back(SIdx);
+    else {
+      auto &Slots = Clumps[CIdx];
+      AllocationOrder.insert(AllocationOrder.end(), Slots.begin(),
+                             Slots.end());
+      Slots.clear();
+    }
+  }
+
+  std::copy(AllocationOrder.begin(), AllocationOrder.end(),
+            ObjectsToAllocate.begin());
+}
Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -120,7 +120,7 @@
   }
 
   /// Return true if this is a load/store that can be potentially paired/merged.
-  bool isCandidateToMergeOrPair(MachineInstr &MI) const;
+  bool isCandidateToMergeOrPair(const MachineInstr &MI) const;
 
   /// Hint that pairing the given load or store is unprofitable.
   void suppressLdStPair(MachineInstr &MI) const;
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1629,22 +1629,27 @@
 // Is this a candidate for ld/st merging or pairing? For example, we don't
 // touch volatiles or load/stores that have a hint to avoid pair formation.
-bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
+bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
+  // If this is a volatile load/store, don't mess with it.
   if (MI.hasOrderedMemoryRef())
     return false;
 
   // Make sure this is a reg+imm (as opposed to an address reloc).
-  assert(MI.getOperand(1).isReg() && "Expected a reg operand.");
+  const MachineOperand &Base = MI.getOperand(1);
+  assert((Base.isReg() || Base.isFI()) &&
+         "Expected a reg or frame index operand.");
   if (!MI.getOperand(2).isImm())
     return false;
 
   // Can't merge/pair if the instruction modifies the base register.
   // e.g., ldr x0, [x0]
-  unsigned BaseReg = MI.getOperand(1).getReg();
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
-  if (MI.modifiesRegister(BaseReg, TRI))
-    return false;
+  // There is no need to check this if the base is, e.g., a frame index.
+  if (Base.isReg()) {
+    unsigned BaseReg = Base.getReg();
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    if (MI.modifiesRegister(BaseReg, TRI))
+      return false;
+  }
 
   // Check if this load/store has a hint to avoid pair formation.
   // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
Index: test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
===================================================================
--- test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
+++ test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone -stack-symbol-ordering=0 < %s | FileCheck %s
 
 ; CHECK: foo
 ; CHECK: str w[[REG0:[0-9]+]], [x19, #264]
Index: test/CodeGen/AArch64/stack-layout-pairing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/stack-layout-pairing.ll
@@ -0,0 +1,376 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+
+declare void @bar(i32*, i32*, i32*, i32*)
+declare void @barv(<4 x i32>*, <4 x i32>*, <4 x i32>*, <4 x i32>*)
+declare void @foo(i32, i32)
+declare void @foof(float, float)
+declare void @foov(<4 x i32>, <4 x i32>)
+
+
+; Check that %x1/%x2 and %y1/%y2 get allocated to adjacent stack
+; locations and loaded using load pair instructions.
+define void @test1(i1 %c) {
+; CHECK-LABEL: test1:
+entry:
+  %x1 = alloca i32, align 4
+  %y1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %y2 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %y1, i32* %x2, i32* %y2)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v6 = load i32, i32* %y1, align 4
+  %v7 = load i32, i32* %y2, align 4
+  call void @foo(i32 %v6, i32 %v7)
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+
+; Check that %x1/%y1 and %x1/%x2 get allocated to adjacent stack locations and
+; loaded using load pair instructions.
+define void @test2(i32 %c) {
+; CHECK-LABEL: test2:
+entry:
+  %x1 = alloca i32, align 4
+  %y1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %y2 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %y1, i32* %x2, i32* %y2)
+  switch i32 %c, label %while.cond.preheader [
+    i32 1, label %while.cond.preheader.thread1
+    i32 2, label %while.cond.preheader.thread2
+  ]
+
+while.cond.preheader.thread1:
+; CHECK-LABEL: %while.cond.preheader.thread1
+; CHECK-NEXT: ldp [[REG1:w[0-9]+]], [[REG2:w[0-9]+]], [sp
+; CHECK-NEXT: bl foo
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %y1, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  br label %while.end
+
+while.cond.preheader.thread2:
+; CHECK-LABEL: %while.cond.preheader.thread2
+; CHECK-NEXT: ldp [[REG2]], [[REG1]], [sp
+; CHECK-NEXT: bl foo
+  %v6 = load i32, i32* %x1, align 4
+  %v7 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v6, i32 %v7)
+  br label %while.body.preheader
+
+while.cond.preheader:
+  %dec6 = add nsw i32 %c, -1
+  %tobool7 = icmp eq i32 %dec6, 0
+  br i1 %tobool7, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  %dec8.ph = phi i32 [ 1, %while.cond.preheader.thread2 ], [ %dec6, %while.cond.preheader ]
+  br label %while.body
+
+while.body:
+  %dec8 = phi i32 [ %dec, %while.body ], [ %dec8.ph, %while.body.preheader ]
+  %v8 = load i32, i32* %x1, align 4
+  %v9 = load i32, i32* %y2, align 4
+  call void @foo(i32 %v8, i32 %v9)
+  %dec = add nsw i32 %dec8, -1
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.end, label %while.body
+
+while.end:
+  ret void
+}
+
+; Check that a large alloca doesn't prevent the %x1/%x2 loads from getting
+; paired.
+define void @test3() {
+; CHECK-LABEL: test3:
+entry:
+  %x1 = alloca i32, align 4
+  %y = alloca i32, i32 8, align 4
+  %x2 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %y, i32* %x2, i32* %x2)
+; CHECK: ldp w1, w0, [sp
+  %vx1 = load i32, i32* %x1, align 4
+  %vx2 = load i32, i32* %x2, align 4
+  %p = getelementptr i32, i32* %y, i32 3
+  %vy = load i32, i32* %p, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foo(i32 %vy, i32 %vy)
+  ret void
+}
+
+; Check that different register classes are laid out separately to form two
+; load pairs.
+define void @test4() {
+; CHECK-LABEL: test4:
+entry:
+  %x1 = alloca i32, align 4
+  %y1 = alloca float, align 4
+  %x2 = alloca i32, align 4
+  %y2 = alloca float, align 4
+  %y1i = bitcast float* %y1 to i32*
+  %y2i = bitcast float* %y2 to i32*
+  call void @bar(i32* %x1, i32* %y1i, i32* %x2, i32* %y2i)
+; CHECK: ldp w1, w0, [sp
+  %vx1 = load i32, i32* %x1, align 4
+  %vx2 = load i32, i32* %x2, align 4
+; CHECK: ldp s{{[0-9]+}}, s{{[0-9]+}}, [sp
+  %vy1 = load float, float* %y1, align 4
+  %vy2 = load float, float* %y2, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foof(float %vy1, float %vy2)
+  ret void
+}
+
+; Check that volatile-ness is taken into account when ordering (since we
+; can't pair volatile loads).
+define void @test5() {
+; CHECK-LABEL: test5:
+entry:
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %x2, i32* %x3, i32* %x3)
+; CHECK: ldp w{{[0-9]+}}, w0, [sp
+  %vx1 = load i32, i32* %x1, align 4
+  %vx2 = load volatile i32, i32* %x2, align 4
+  %vx3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foo(i32 %vx3, i32 %vx3)
+  ret void
+}
+
+; Check that ties are broken by preserving the original stack layout
+; (i.e. lexical order).
+define void @test6() {
+; CHECK-LABEL: test6:
+entry:
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x1, i32* %x2, i32* %x3, i32* %x3)
+; CHECK: ldp w{{[0-9]+}}, w1, [sp, #4
+; CHECK: ldr w0, [sp, #12
+  %vx3 = load i32, i32* %x3, align 4
+  %vx2 = load i32, i32* %x2, align 4
+  %vx1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %vx1, i32 %vx2)
+  call void @foo(i32 %vx3, i32 %vx3)
+  ret void
+}
+
+; The following few tests exercise the clump formation code in
+; AArch64FrameLowering::orderFrameObjects.
+
+; Attach clump 2-1.
+define void @test7(i1 %c) {
+; CHECK-LABEL: test7:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x1, align 4
+  %v3 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v2, i32 %v3)
+; CHECK: ldp w1, w0, [sp
+  %v2_2 = load i32, i32* %x1, align 4
+  %v3_2 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v2_2, i32 %v3_2)
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+; Merge clumps 1-2.
+define void @test8(i1 %c) {
+; CHECK-LABEL: test8:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+; CHECK: ldp w1, w0, [sp
+  %v2_2 = load i32, i32* %x2, align 4
+  %v3_2 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2_2, i32 %v3_2)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w1, w0, [sp
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+
+; Merge clumps 1-rev(2).
+define void @test9(i1 %c) {
+; CHECK-LABEL: test9:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w0, w1, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w1, w0, [sp
+  %v4 = load i32, i32* %x1, align 4
+  %v5 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+; Merge clumps 2-1.
+define void @test10(i1 %c) {
+; CHECK-LABEL: test10:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w0, w1, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w0, w1, [sp
+  %v4 = load i32, i32* %x0, align 4
+  %v5 = load i32, i32* %x2, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+; Merge clumps 2-rev(1).
+define void @test11(i1 %c) {
+; CHECK-LABEL: test11:
+entry:
+  %x0 = alloca i32, align 4
+  %x1 = alloca i32, align 4
+  %x2 = alloca i32, align 4
+  %x3 = alloca i32, align 4
+  call void @bar(i32* %x0, i32* %x1, i32* %x2, i32* %x3)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp w1, w0, [sp
+  %v0 = load i32, i32* %x0, align 4
+  %v1 = load i32, i32* %x1, align 4
+  call void @foo(i32 %v0, i32 %v1)
+  br label %if.end
+
+if.else:
+; CHECK: ldp w1, w0, [sp
+  %v2 = load i32, i32* %x2, align 4
+  %v3 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v2, i32 %v3)
+  br label %if.end
+
+if.end:
+; CHECK: ldp w0, w1, [sp
+  %v4 = load i32, i32* %x0, align 4
+  %v5 = load i32, i32* %x3, align 4
+  call void @foo(i32 %v4, i32 %v5)
+  ret void
+}
+
+
+; Test vector type pairing.
+define void @test12(i1 %c) {
+; CHECK-LABEL: test12:
+entry:
+  %x1 = alloca <4 x i32>, align 16
+  %y1 = alloca <4 x i32>, align 16
+  %x2 = alloca <4 x i32>, align 16
+  %y2 = alloca <4 x i32>, align 16
+  call void @barv(<4 x i32>* %x1, <4 x i32>* %y1, <4 x i32>* %x2, <4 x i32>* %y2)
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+; CHECK: ldp q1, q0, [sp
+  %v4 = load <4 x i32>, <4 x i32>* %x1, align 4
+  %v5 = load <4 x i32>, <4 x i32>* %x2, align 4
+  call void @foov(<4 x i32> %v4, <4 x i32> %v5)
+  br label %if.end
+
+if.else:
+; CHECK: ldp q1, q0, [sp
+  %v6 = load <4 x i32>, <4 x i32>* %y1, align 4
+  %v7 = load <4 x i32>, <4 x i32>* %y2, align 4
+  call void @foov(<4 x i32> %v6, <4 x i32> %v7)
+  br label %if.end
+
+if.end:
+  ret void
+}
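Note on the ordering heuristic: the clump-building logic in the patch (addNewClump / tryAttachTo / tryMerge) can be hard to follow inside the full pass, so below is a minimal standalone sketch of the same greedy idea in plain C++. It assumes the affinity edges have already been collected and sorted by decreasing weight; the function name buildOrder, the STL containers, and the sample edges are illustrative only and are not part of the patch.

// Standalone sketch (illustrative only, not part of the patch): greedy
// "clump" ordering of stack slots from a list of affinity edges that is
// already sorted by decreasing weight.
#include <algorithm>
#include <deque>
#include <iostream>
#include <utility>
#include <vector>

using Clump = std::deque<int>;

std::vector<int> buildOrder(int NumSlots,
                            const std::vector<std::pair<int, int>> &Edges,
                            const std::vector<int> &OriginalOrder) {
  std::vector<Clump> Clumps;
  std::vector<int> ClumpMap(NumSlots, -1); // slot -> clump index, -1 = none

  // Attach NewSlot next to Anchor if Anchor sits at either end of clump C.
  auto attach = [&](int C, int Anchor, int NewSlot) {
    Clump &S = Clumps[C];
    if (S.back() == Anchor) {
      S.push_back(NewSlot);
      ClumpMap[NewSlot] = C;
    } else if (S.front() == Anchor) {
      S.push_front(NewSlot);
      ClumpMap[NewSlot] = C;
    } // Anchor is interior: this edge cannot be honored, so drop it.
  };

  for (const auto &E : Edges) {
    int A = E.first, B = E.second;
    int CA = ClumpMap[A], CB = ClumpMap[B];
    if (CA < 0 && CB < 0) { // Start a new clump {A, B}.
      ClumpMap[A] = ClumpMap[B] = static_cast<int>(Clumps.size());
      Clumps.push_back({A, B});
    } else if (CA < 0) {
      attach(CB, B, A);
    } else if (CB < 0) {
      attach(CA, A, B);
    } else if (CA != CB) {
      // Merge the two clumps only if A and B sit at usable ends; orient the
      // clumps so that A ends up adjacent to B in the merged clump.
      Clump &SA = Clumps[CA];
      Clump &SB = Clumps[CB];
      if (SA.front() != A && SA.back() != A)
        continue;
      if (SB.front() != B && SB.back() != B)
        continue;
      if (SA.front() == A)
        std::reverse(SA.begin(), SA.end());
      if (SB.back() == B)
        std::reverse(SB.begin(), SB.end());
      SA.insert(SA.end(), SB.begin(), SB.end());
      for (int S : SB)
        ClumpMap[S] = CA;
      SB.clear();
    }
  }

  // Emit each clump at the first original position of any of its members;
  // slots that never joined a clump keep their original relative order.
  std::vector<int> Order;
  for (int S : OriginalOrder) {
    int C = ClumpMap[S];
    if (C < 0)
      Order.push_back(S);
    else if (!Clumps[C].empty()) {
      Order.insert(Order.end(), Clumps[C].begin(), Clumps[C].end());
      Clumps[C].clear();
    }
  }
  return Order;
}

int main() {
  // Hot pairs (0,2) and (1,3) over slots 0..3, mirroring test1 above where
  // %x1/%x2 and %y1/%y2 are the hot pairs.
  for (int S : buildOrder(4, {{0, 2}, {1, 3}}, {0, 1, 2, 3}))
    std::cout << S << ' ';
  std::cout << '\n'; // prints: 0 2 1 3
}

With those sample edges the sketch prints 0 2 1 3: each hot pair becomes adjacent, so a single ldp/stp can cover it, while unrelated slots keep their original relative order.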