Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -61,6 +61,14 @@
 /// ARMAllocLoadStoreOpt - Post- register allocation pass the combine
 /// load / store instructions to form ldm / stm instructions.
 
+static cl::opt<unsigned> ScanLimit("arm-load-store-scan-limit",
+                                   cl::init(20), cl::Hidden);
+
+static cl::opt<bool> AlwaysCollapseToLoadStoreDouble(
+    "arm-load-store-use-ldrd-strd", cl::Hidden,
+    cl::desc("Always try to collapse load/store pairs into ldrd/strd if "
+             "available on the target architecture"), cl::init(true));
+
 namespace {
   struct ARMLoadStoreOpt : public MachineFunctionPass {
     static char ID;
@@ -139,8 +147,19 @@
                                    MachineBasicBlock::iterator MBBI,
                                    bool &Advance,
                                    MachineBasicBlock::iterator &I);
+    // Merge the two instructions indicated into a single pair-wise
+    // instruction. If MergeForward is true, erase the first instruction and
+    // fold its operation into the second. If false, the reverse. Return the
+    // instruction following the first instruction (which may change during
+    // processing).
+    MachineBasicBlock::iterator
+    mergePairedInsns(MachineBasicBlock::iterator I,
+                     MachineBasicBlock::iterator Paired, bool MergeForward);
     bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool LoadStoreToDoubleOpti(MachineBasicBlock &MBB);
     bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+    MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
+                                                 bool &MergeForward, unsigned Limit);
   };
   char ARMLoadStoreOpt::ID = 0;
 }
@@ -371,6 +390,75 @@
     return 4;
   }
 }
+static unsigned getMatchingPairOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no pairwise equivalent!");
+  case ARM::t2LDRi12:
+    return ARM::t2LDRDi8;
+  case ARM::t2STRi12:
+    return ARM::t2STRDi8;
+  }
+}
+
+MachineBasicBlock::iterator
+ARMLoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+                                  MachineBasicBlock::iterator Paired,
+                                  bool MergeForward) {
+  MachineBasicBlock::iterator NextI = I;
+  ++NextI;
+  // If NextI is the second of the two instructions to be merged, we need
+  // to skip one further. Either way, merging will invalidate the iterator,
+  // and we don't need to scan the new instruction, as it's a pairwise
+  // instruction, which we're not considering for further action anyway.
+  if (NextI == Paired)
+    ++NextI;
+
+  unsigned Opc = I->getOpcode();
+  int OffsetStride = 4;
+
+  unsigned NewOpc = getMatchingPairOpcode(Opc);
+  // Insert our new paired instruction after whichever of the paired
+  // instructions MergeForward indicates.
+  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
+  // MergeForward also determines which instruction we copy the base register
+  // operand from, so that the operand flags stay compatible with the input
+  // code.
+  MachineOperand &BaseRegOp =
+      MergeForward ? Paired->getOperand(1) : I->getOperand(1);
+
+  // Which register is Rt and which is Rt2 depends on the offset order.
+  MachineInstr *RtMI, *Rt2MI;
+  if (I->getOperand(2).getImm() ==
+      Paired->getOperand(2).getImm() + OffsetStride) {
+    RtMI = Paired;
+    Rt2MI = I;
+  } else {
+    RtMI = I;
+    Rt2MI = Paired;
+  }
+  int OffsetImm = RtMI->getOperand(2).getImm();
+
+  // Construct the new instruction.
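+  // The paired opcode takes (Rt, Rt2, base register, byte offset); the
+  // predicate operands are appended below by AddDefaultPred.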
+  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
+                                    I->getDebugLoc(), TII->get(NewOpc))
+                                .addOperand(RtMI->getOperand(0))
+                                .addOperand(Rt2MI->getOperand(0))
+                                .addOperand(BaseRegOp)
+                                .addImm(OffsetImm);
+  AddDefaultPred(MIB);
+
+  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n    ");
+  DEBUG(I->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(Paired->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
+  DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions.
+  I->eraseFromParent();
+  Paired->eraseFromParent();
+
+  return NextI;
+}
 
 /// Update future uses of the base register with the offset introduced
 /// due to writeback. This function only works on Thumb1.
@@ -1590,6 +1678,198 @@
   return false;
 }
 
+/// trackRegDefsUses - Remember what registers the specified instruction uses
+/// and modifies.
+static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs,
+                             BitVector &UsedRegs,
+                             const TargetRegisterInfo *TRI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isRegMask())
+      ModifiedRegs.setBitsNotInMask(MO.getRegMask());
+
+    if (!MO.isReg() || MO.getReg() == 0)
+      continue;
+    unsigned Reg = MO.getReg();
+    if (MO.isDef()) {
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        ModifiedRegs.set(*AI);
+    } else {
+      assert(MO.isUse() && "Reg operand not a def and not a use?!?");
+      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+        UsedRegs.set(*AI);
+    }
+  }
+}
+
+/// findMatchingInsn - Scan the instructions looking for a load/store that can
+/// be combined with the current instruction into a load/store pair.
+MachineBasicBlock::iterator
+ARMLoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+                                  bool &MergeForward, unsigned Limit) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr *FirstMI = I;
+  ++MBBI;
+
+  int Opc = FirstMI->getOpcode();
+  bool MayLoad = FirstMI->mayLoad();
+  unsigned Reg = FirstMI->getOperand(0).getReg();
+  unsigned BaseReg = FirstMI->getOperand(1).getReg();
+  int Offset = FirstMI->getOperand(2).getImm();
+
+  // Early exit if the first instruction modifies the base register.
+  if (FirstMI->modifiesRegister(BaseReg, TRI))
+    return E;
+
+  int OffsetStride = 4;
+
+  // Track which registers have been modified and used between the first insn
+  // (inclusive) and the second insn.
+  BitVector ModifiedRegs, UsedRegs;
+  ModifiedRegs.resize(TRI->getNumRegs());
+  UsedRegs.resize(TRI->getNumRegs());
+  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+    MachineInstr *MI = MBBI;
+
+    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+    // optimization by changing how far we scan.
+    if (MI->isDebugValue())
+      continue;
+
+    // Now that we know this is a real instruction, count it.
+    ++Count;
+
+    bool CanMergeOpc = Opc == MI->getOpcode();
+
+    if (CanMergeOpc && MI->getOperand(2).isImm()) {
+      // If we've found another instruction with the same opcode, check to see
+      // if the base and offset are compatible with our starting instruction.
+      unsigned MIBaseReg = MI->getOperand(1).getReg();
+      int MIOffset = MI->getOperand(2).getImm();
+      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
+                                   (Offset + OffsetStride == MIOffset))) {
+        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
+        // t2LDRDi8/t2STRDi8 can only encode a multiple-of-4 offset up to
+        // 1020, which is narrower than the 12-bit immediate of
+        // t2LDRi12/t2STRi12, so skip candidates the paired form cannot
+        // represent and keep looking.
+        if ((MinOffset & 3) != 0 || MinOffset > 1020) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          continue;
+        }
+        // If this is a volatile load/store that otherwise matched, stop
+        // looking as something is going on that we don't have enough
+        // information to safely transform.
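+        // hasOrderedMemoryRef() is conservative: it also covers atomic
+        // accesses and instructions with no memory operand information.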
+        if (MI->hasOrderedMemoryRef())
+          return E;
+
+        // If the destination register of the loads is the same register, bail
+        // and keep looking. A load-pair instruction with both destination
+        // registers the same is UNPREDICTABLE and will result in an exception.
+        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
+          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+          continue;
+        }
+
+        // Cortex-M3 erratum 602117: LDRD with base in list may result in an
+        // incorrect base register when interrupted or faulted.
+        if (STI->isCortexM3() && MI->modifiesRegister(BaseReg, TRI))
+          return E;
+
+        // If the Rt of the second instruction was not modified or used between
+        // the two instructions, we can combine the second into the first.
+        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
+            !UsedRegs[MI->getOperand(0).getReg()]) {
+          MergeForward = false;
+          return MBBI;
+        }
+
+        // Likewise, if the Rt of the first instruction is not modified or used
+        // between the two instructions, we can combine the first into the
+        // second.
+        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
+            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
+          MergeForward = true;
+          return MBBI;
+        }
+        // Unable to combine these instructions due to interference in between.
+        // Keep looking.
+      }
+    }
+
+    // If the instruction wasn't a matching load or store, but does (or can)
+    // modify memory, stop searching, as we don't have alias analysis or
+    // anything like that to tell us whether the access is tromping on the
+    // locations we care about. The big one we want to catch is calls.
+    //
+    // FIXME: Theoretically, we can do better than that for SP and FP based
+    // references since we can effectively know where those are touching. It's
+    // unclear if it's worth the extra code, though. Most paired instructions
+    // will be sequential, perhaps with a few intervening non-memory related
+    // instructions.
+    if (MI->mayStore() || MI->isCall())
+      return E;
+    // Likewise, if we're matching a store instruction, we don't want to
+    // move across a load, as it may be reading the same location.
+    if (FirstMI->mayStore() && MI->mayLoad())
+      return E;
+
+    // Update modified / uses register lists.
+    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+    // Otherwise, if the base register is modified, we have no match, so
+    // return early.
+    if (ModifiedRegs[BaseReg])
+      return E;
+  }
+  return E;
+}
+
+// FIXME: Currently, this only supports collapsing ldr/str pairs into
+// ldrd/strd on v7-M based cores. The v7-A and v7-R architectures also support
+// ldrd/strd, but with a few extra restrictions; for example, the first
+// destination register of ldrd must be an even-numbered register and the
+// second must be the next consecutive register. We should update the code at
+// some point to make it possible to generate ldrd/strd for these
+// architectures as well.
+bool ARMLoadStoreOpt::LoadStoreToDoubleOpti(MachineBasicBlock &MBB) {
+  if (!isThumb2 || !STI->hasV7Ops() || !STI->isMClass())
+    return false;
+
+  bool Modified = false;
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      ++MBBI;
+      break;
+    case ARM::t2LDRi12:
+    case ARM::t2STRi12: {
+      // If this is a volatile load/store, don't mess with it.
+      if (MI->hasOrderedMemoryRef()) {
+        ++MBBI;
+        break;
+      }
+      // Make sure this is a reg+imm (as opposed to an address reloc).
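+      // (Operand 2 of t2LDRi12/t2STRi12 holds the immediate offset; a
+      // relocated address shows up here as a non-immediate operand.)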
+      if (!MI->getOperand(2).isImm()) {
+        ++MBBI;
+        break;
+      }
+      // Look ahead up to ScanLimit instructions for a pairable instruction.
+      bool MergeForward = false;
+      MachineBasicBlock::iterator Paired =
+          findMatchingInsn(MBBI, MergeForward, ScanLimit);
+      if (Paired != E) {
+        // Merge the loads into a pair. Keeping the iterator straight is a
+        // pain, so we let the merge routine tell us what the next instruction
+        // is after it's done mucking about.
+        MBBI = mergePairedInsns(MBBI, Paired, MergeForward);
+        Modified = true;
+        break;
+      }
+      ++MBBI;
+      break;
+    }
+    }
+  }
+  return Modified;
+}
+
 /// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
 /// ops of the same base and incrementing offset into LDM / STM ops.
 bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
@@ -1828,9 +2108,13 @@
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
        ++MFI) {
     MachineBasicBlock &MBB = *MFI;
-    Modified |= LoadStoreMultipleOpti(MBB);
-    if (STI->hasV5TOps())
-      Modified |= MergeReturnIntoLDM(MBB);
+    bool BlockModified = false;
+    if (AlwaysCollapseToLoadStoreDouble)
+      BlockModified |= LoadStoreToDoubleOpti(MBB);
+    if (!BlockModified) {
+      BlockModified |= LoadStoreMultipleOpti(MBB);
+      if (STI->hasV5TOps())
+        BlockModified |= MergeReturnIntoLDM(MBB);
+    }
+    Modified |= BlockModified;
   }
 
   delete RS;
Index: test/CodeGen/ARM/ldrd.ll
===================================================================
--- test/CodeGen/ARM/ldrd.ll
+++ test/CodeGen/ARM/ldrd.ll
@@ -18,6 +18,7 @@
 
 ; M3-LABEL: t:
 ; M3-NOT: ldrd
+; M3: umull
 
   %0 = load i64*, i64** @b, align 4
   %1 = load i64, i64* %0, align 4
Index: test/CodeGen/Thumb2/aapcs.ll
===================================================================
--- test/CodeGen/Thumb2/aapcs.ll
+++ test/CodeGen/Thumb2/aapcs.ll
@@ -33,8 +33,7 @@
 
 define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) {
 ; CHECK-LABEL: double_on_stack:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
 ; HARD: vldr d0, [sp]
 ; CHECK-NEXT: bx lr
   ret double %i
@@ -42,8 +41,7 @@
 
 define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) {
 ; CHECK-LABEL: double_not_split:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
 ; HARD: vldr d0, [sp]
 ; CHECK-NEXT: bx lr
   ret double %i
Index: test/CodeGen/Thumb2/thumb2-memcpy-ldrd-strd.ll
===================================================================
--- test/CodeGen/Thumb2/thumb2-memcpy-ldrd-strd.ll
+++ test/CodeGen/Thumb2/thumb2-memcpy-ldrd-strd.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+thumb2 -mcpu=cortex-m7 | FileCheck %s
+@d = external global [64 x i32]
+@s = external global [64 x i32]
+
+; Function Attrs: nounwind
+define void @t1() #0 {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: ldrd
+; CHECK-NEXT: ldrd
+; CHECK-NEXT: strd
+; CHECK-NEXT: strd
+; CHECK-NEXT: ldrb
+; CHECK-NEXT: strb
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind }