Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -110,6 +110,10 @@
       /// Index into the basic block where the merged instruction will be
       /// inserted. (See MemOpQueueEntry.Position)
       unsigned InsertPos;
+      /// Whether the instructions can be merged into a ldm/stm instruction.
+      bool CanMergeToLoadStoreMulti;
+      /// Whether the instructions can be merged into a ldrd/strd instruction.
+      bool CanMergeToLoadStoreDouble;
     };
     BumpPtrAllocator Allocator;
     SmallVector<const MergeCandidate*,4> Candidates;
@@ -121,11 +125,14 @@
                            MachineBasicBlock::iterator MBBI, DebugLoc DL,
                            unsigned Base, unsigned WordOffset,
                            ARMCC::CondCodes Pred, unsigned PredReg);
-    MachineInstr *MergeOps(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator InsertBefore, int Offset,
-                           unsigned Base, bool BaseKill, unsigned Opcode,
-                           ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL,
-                           ArrayRef<std::pair<unsigned, bool>> Regs);
+    MachineInstr *CreateLoadStoreMulti(MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs);
+    MachineInstr *CreateLoadStoreDouble(MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+        bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+        DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const;
     void FormCandidates(const MemOpQueue &MemOps);
     MachineInstr *MergeOpsUpdate(const MergeCandidate &Cand);
     bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
@@ -554,12 +561,10 @@
 /// Create and insert a LDM or STM with Base as base register and registers in
 /// Regs as the register operands that would be loaded / stored. It returns
 /// true if the transformation is done.
-MachineInstr *
-ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator InsertBefore, int Offset,
-                          unsigned Base, bool BaseKill, unsigned Opcode,
-                          ARMCC::CondCodes Pred, unsigned PredReg, DebugLoc DL,
-                          ArrayRef<std::pair<unsigned, bool>> Regs) {
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) {
   unsigned NumRegs = Regs.size();
   assert(NumRegs > 1);
 
@@ -742,6 +747,28 @@
   return MIB.getInstr();
 }
 
+MachineInstr *ARMLoadStoreOpt::CreateLoadStoreDouble(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, int Offset, unsigned Base,
+    bool BaseKill, unsigned Opcode, ARMCC::CondCodes Pred, unsigned PredReg,
+    DebugLoc DL, ArrayRef<std::pair<unsigned, bool>> Regs) const {
+  assert((isi32Load(Opcode) || isi32Store(Opcode)) &&
+         "Must have integer load or store");
+  unsigned LoadStoreOpcode = isi32Load(Opcode) ? ARM::t2LDRDi8 : ARM::t2STRDi8;
+
+  assert(Regs.size() == 2);
+  MachineInstrBuilder MIB = BuildMI(MBB, InsertBefore, DL,
+                                    TII->get(LoadStoreOpcode));
+  if (LoadStoreOpcode == ARM::t2LDRDi8) {
+    MIB.addReg(Regs[0].first, RegState::Define)
+       .addReg(Regs[1].first, RegState::Define);
+  } else {
+    MIB.addReg(Regs[0].first, getKillRegState(Regs[0].second))
+       .addReg(Regs[1].first, getKillRegState(Regs[1].second));
+  }
+  MIB.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+  return MIB.getInstr();
+}
+
 /// Call MergeOps and update MemOps and merges accordingly on success.
 MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
   const MachineInstr *First = Cand.Instrs.front();
@@ -790,7 +817,12 @@
   unsigned PredReg = 0;
   ARMCC::CondCodes Pred = getInstrPredicate(First, PredReg);
   DebugLoc DL = First->getDebugLoc();
-  MachineInstr *Merged = MergeOps(MBB, InsertBefore, Offset, Base, BaseKill,
+  MachineInstr *Merged = nullptr;
+  if (Cand.CanMergeToLoadStoreDouble)
+    Merged = CreateLoadStoreDouble(MBB, InsertBefore, Offset, Base, BaseKill,
+                                   Opcode, Pred, PredReg, DL, Regs);
+  if (!Merged && Cand.CanMergeToLoadStoreMulti)
+    Merged = CreateLoadStoreMulti(MBB, InsertBefore, Offset, Base, BaseKill,
                                   Opcode, Pred, PredReg, DL, Regs);
   if (!Merged)
     return nullptr;
@@ -852,6 +884,12 @@
   return Merged;
 }
 
+static bool isValidLoadStoreDoubleOffset(int Offset) {
+  unsigned Value = abs(Offset);
+  // We support an 8-bit immediate, which is internally multiplied by 4.
+  return (Value % 4) == 0 && Value < 1024;
+}
+
 /// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
 void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
   const MachineInstr *FirstMI = MemOps[0].MI;
@@ -890,6 +928,12 @@
     unsigned Latest = SIndex;
     unsigned Earliest = SIndex;
     unsigned Count = 1;
+    bool CanMergeToLoadStoreDouble = isNotVFP && STI->isThumb2()
+        && isValidLoadStoreDoubleOffset(Offset)
+        // ARM errata 602117: LDRD with base in list may result in incorrect base
+        // register when interrupted or faulted.
+        && (!STI->isCortexM3() || PReg != getLoadStoreBaseOp(*MI).getReg());
+    bool CanMergeToLoadStoreMulti = true;
 
     // Merge additional instructions fulfilling LDM/STM constraints.
     for (unsigned I = SIndex+1;;) {
@@ -898,29 +942,40 @@
       const MachineOperand &MO = getLoadStoreRegOp(*MemOps[I].MI);
       unsigned Reg = MO.getReg();
       unsigned RegNum = MO.isUndef() ? UINT_MAX : TRI->getEncodingValue(Reg);
+      bool CanMergeThisToLoadStoreMulti = false;
+      bool CanMergeThisToLoadStoreDouble = false;
 
       // Register numbers must be in ascending order. For VFP / NEON load and
       // store multiples, the registers must also be consecutive and within
       // the limit on the number of registers per instruction.
-      if (Reg != ARM::SP &&
-          NewOffset == Offset + (int)Size &&
-          ((isNotVFP && RegNum > PRegNum) ||
-           ((Count < Limit) && RegNum == PRegNum+1)) &&
-          // On Swift we don't want vldm/vstm to start with a odd register num
-          // because Q register unaligned vldm/vstm need more uops.
-          (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
-        // Track MemOp with latest and earliest position (Positions are
-        // counted in reverse).
-        unsigned Position = MemOps[I].Position;
-        if (Position < MemOps[Latest].Position)
-          Latest = I;
-        else if (Position > MemOps[Earliest].Position)
-          Earliest = I;
-        // Prepare for next MemOp.
-        Offset += Size;
-        PRegNum = RegNum;
-        ++Count;
-        ++I;
-        continue;
+      if (NewOffset == Offset + (int)Size) {
+        if (CanMergeToLoadStoreMulti && Reg != ARM::SP &&
+            ((isNotVFP && RegNum > PRegNum) ||
+             ((Count < Limit) && RegNum == PRegNum+1)) &&
+            // On Swift we don't want vldm/vstm to start with an odd register
+            // num because Q register unaligned vldm/vstm need more uops.
+            (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
+          CanMergeThisToLoadStoreMulti = true;
+        }
+        if (CanMergeToLoadStoreDouble && Count <= 1) {
+          CanMergeThisToLoadStoreDouble = true;
+        }
+        if (CanMergeThisToLoadStoreMulti || CanMergeThisToLoadStoreDouble) {
+          CanMergeToLoadStoreMulti &= CanMergeThisToLoadStoreMulti;
+          CanMergeToLoadStoreDouble &= CanMergeThisToLoadStoreDouble;
+          // Track MemOp with latest and earliest position (Positions are
+          // counted in reverse).
+          unsigned Position = MemOps[I].Position;
+          if (Position < MemOps[Latest].Position)
+            Latest = I;
+          else if (Position > MemOps[Earliest].Position)
+            Earliest = I;
+          // Prepare for next MemOp.
+          Offset += Size;
+          PRegNum = RegNum;
+          ++Count;
+          ++I;
+          continue;
+        }
       }
     }
@@ -931,6 +986,10 @@
     Candidate->LatestMIIdx = Latest - SIndex;
     Candidate->EarliestMIIdx = Earliest - SIndex;
     Candidate->InsertPos = MemOps[Latest].Position;
+    if (Count == 1)
+      CanMergeToLoadStoreMulti = CanMergeToLoadStoreDouble = false;
+    Candidate->CanMergeToLoadStoreDouble = CanMergeToLoadStoreDouble;
+    Candidate->CanMergeToLoadStoreMulti = CanMergeToLoadStoreMulti;
     Candidates.push_back(Candidate);
     // Restart Chain Building.
     SIndex = I;
@@ -1674,12 +1733,17 @@
   // Go through list of candidates and merge.
   bool Changed = false;
   for (const MergeCandidate *Candidate : Candidates) {
-    if (Candidate->Instrs.size() > 1) {
+    if (Candidate->CanMergeToLoadStoreMulti ||
+        Candidate->CanMergeToLoadStoreDouble) {
       MachineInstr *Merged = MergeOpsUpdate(*Candidate);
       // Merge preceding/trailing base inc/dec into the merged op.
       if (Merged) {
-        MergeBaseUpdateLSMultiple(Merged);
         Changed = true;
+        unsigned Opcode = Merged->getOpcode();
+        if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8)
+          ; // No merging code yet.
+        else
+          MergeBaseUpdateLSMultiple(Merged);
       } else {
         for (MachineInstr *MI : Candidate->Instrs) {
           if (MergeBaseUpdateLoadStore(MI))
Index: test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
===================================================================
--- test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
+++ test/CodeGen/ARM/2013-05-02-AAPCS-ByVal-Structs-C4-C5-VFP.ll
@@ -25,8 +25,7 @@
   ;CHECK: push {r7, lr}
   ;CHECK: sub sp, #4
   ;CHECK: add r0, sp, #12
-  ;CHECK: str r2, [sp, #16]
-  ;CHECK: str r1, [sp, #12]
+  ;CHECK: strd r1, r2, [sp, #12]
   ;CHECK: bl fooUseStruct
   call void @fooUseStruct(%st_t* %p1)
   ret void
Index: test/CodeGen/ARM/byval-align.ll
===================================================================
--- test/CodeGen/ARM/byval-align.ll
+++ test/CodeGen/ARM/byval-align.ll
@@ -28,8 +28,7 @@
 ; CHECK: push {r4, r7, lr}
 ; CHECK: add r7, sp, #4
 
-; CHECK-DAG: str r2, [r7, #8]
-; CHECK-DAG: str r3, [r7, #12]
+; CHECK: strd r2, r3, [r7, #8]
 
 ; CHECK: ldr r0, [r7, #8]
 
Index: test/CodeGen/ARM/ldrd.ll
===================================================================
--- test/CodeGen/ARM/ldrd.ll
+++ test/CodeGen/ARM/ldrd.ll
@@ -110,5 +110,25 @@
   ret void
 }
 
+; CHECK-LABEL: strd_spill_ldrd_reload:
+; A8: strd r1, r0, [sp]
+; M3: strd r1, r0, [sp]
+; BASIC: strd r1, r0, [sp]
+; GREEDY: strd r0, r1, [sp]
+; CHECK: @ InlineAsm Start
+; CHECK: @ InlineAsm End
+; A8: ldrd r2, r1, [sp]
+; M3: ldrd r2, r1, [sp]
+; BASIC: ldrd r2, r1, [sp]
+; GREEDY: ldrd r1, r2, [sp]
+; CHECK: bl{{x?}} _extfunc
+define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
+  ; force %v0 and %v1 to be spilled
+  call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{lr}"()
+  ; force the reloaded %v0, %v1 into different registers
+  call void @extfunc(i32 0, i32 %v0, i32 %v1, i32 7)
+  ret void
+}
+
 declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
 declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
Index: test/CodeGen/ARM/memset-inline.ll
===================================================================
--- test/CodeGen/ARM/memset-inline.ll
+++ test/CodeGen/ARM/memset-inline.ll
@@ -4,8 +4,7 @@
 entry:
 ; CHECK-LABEL: t1:
 ; CHECK: movs r1, #0
-; CHECK: str r1, [r0]
-; CHECK: str r1, [r0, #4]
+; CHECK: strd r1, r1, [r0]
 ; CHECK: str r1, [r0, #8]
   call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
   ret void
Index: test/CodeGen/Thumb2/aapcs.ll
===================================================================
--- test/CodeGen/Thumb2/aapcs.ll
+++ test/CodeGen/Thumb2/aapcs.ll
@@ -33,8 +33,7 @@
 
 define double @double_on_stack(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) {
 ; CHECK-LABEL: double_on_stack:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
 ; HARD: vldr d0, [sp]
 ; CHECK-NEXT: bx lr
   ret double %i
@@ -42,8 +41,7 @@
 
 define double @double_not_split(double %a, double %b, double %c, double %d, double %e, double %f, double %g, float %h, double %i) {
 ; CHECK-LABEL: double_not_split:
-; SOFT: ldr r0, [sp, #48]
-; SOFT: ldr r1, [sp, #52]
+; SOFT: ldrd r0, r1, [sp, #48]
 ; HARD: vldr d0, [sp]
 ; CHECK-NEXT: bx lr
   ret double %i
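Note (not part of the patch): a minimal sketch of the transformation this enables, assuming a generic Thumb2 triple. The function name is hypothetical and the exact registers llc assigns may differ, so the strd operands are matched loosely.

; RUN: llc -mtriple=thumbv7-none-eabi < %s | FileCheck %s

; Two i32 stores at offsets 0 and 4 from a common base; with this patch the
; load/store optimizer should pair them into a single strd instead of two str.
; CHECK-LABEL: adjacent_stores:
; CHECK: strd {{r[0-9]+}}, {{r[0-9]+}}, [r0]
define void @adjacent_stores(i32* %p, i32 %a, i32 %b) {
  %q = getelementptr inbounds i32, i32* %p, i32 1
  store i32 %a, i32* %p, align 4
  store i32 %b, i32* %q, align 4
  ret void
}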