diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h --- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h @@ -91,6 +91,8 @@ /// all registers that were disabled are removed from the list. SmallVector UpdatedCSRs; + void initUpdatedCSRs(); + /// RegAllocHints - This vector records register allocation hints for /// virtual registers. For each virtual register, it keeps a pair of hint /// type and hints vector making up the allocation hints. Only the first @@ -231,12 +233,17 @@ /// Disables the register from the list of CSRs. /// I.e. the register will not appear as part of the CSR mask. - /// \see UpdatedCalleeSavedRegs. - void disableCalleeSavedRegister(unsigned Reg); + /// \see UpdatedCSRs. + void disableCalleeSavedRegister(Register Reg); + + /// Enables the register from the list of CSRs. + /// I.e. the register will appear as part of the CSR mask. + /// \see UpdatedCSRs. + void enableCalleeSavedRegister(Register Reg); /// Returns list of callee saved registers. /// The function returns the updated CSR list (after taking into account - /// registers that are disabled from the CSR list). + /// registers that are enabled/disabled from the CSR list). const MCPhysReg *getCalleeSavedRegs() const; /// Sets the updated Callee Saved Registers list. 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -610,30 +610,54 @@ return false; } -void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) { +void MachineRegisterInfo::initUpdatedCSRs() { + if (IsUpdatedCSRsInitialized) + return; const TargetRegisterInfo *TRI = getTargetRegisterInfo(); - assert(Reg && (Reg < TRI->getNumRegs()) && - "Trying to disable an invalid register"); + const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); + for (const MCPhysReg *I = CSR; *I; ++I) + UpdatedCSRs.push_back(*I); - if (!IsUpdatedCSRsInitialized) { - const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF); - for (const MCPhysReg *I = CSR; *I; ++I) - UpdatedCSRs.push_back(*I); + // Zero value represents the end of the register list + // (no more registers should be pushed). + UpdatedCSRs.push_back(0); - // Zero value represents the end of the register list - // (no more registers should be pushed). - UpdatedCSRs.push_back(0); + IsUpdatedCSRsInitialized = true; +} - IsUpdatedCSRsInitialized = true; - } +void MachineRegisterInfo::disableCalleeSavedRegister(Register Reg) { + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + assert(Reg && (Reg < TRI->getNumRegs()) && + "Trying to disable an invalid register"); + + initUpdatedCSRs(); - // Remove the register (and its aliases from the list). + // Remove the register (and its aliases) from the CSR list. for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI), UpdatedCSRs.end()); } +void MachineRegisterInfo::enableCalleeSavedRegister(Register Reg) { + const TargetRegisterInfo *TRI = getTargetRegisterInfo(); + assert(Reg && (Reg < TRI->getNumRegs()) && + "Trying to disable an invalid register"); + + initUpdatedCSRs(); + + // Remove the null terminator from the end of the list. 
+ assert(UpdatedCSRs.back() == 0); + UpdatedCSRs.pop_back(); + + // Add the register (and its sub-registers) to the CSR list. + for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI) + UpdatedCSRs.push_back(*SRI); + + // Put the null terminator back. + UpdatedCSRs.push_back(0); +} + const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const { if (IsUpdatedCSRsInitialized) return UpdatedCSRs.data(); diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -453,6 +453,8 @@ FrameIdx = MFI.CreateFixedSpillStackObject(Size, FixedSlot->Offset); } + LLVM_DEBUG(dbgs() << "Assigned " << RegInfo->getName(Reg) + << " to spill slot " << FrameIdx << "\n"); CS.setFrameIdx(FrameIdx); } } diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -56,6 +56,10 @@ void getCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const override; + void findRegDefsOutsideSaveRestore(MachineFunction &MF, + BitVector &Regs) const; + unsigned spillExtraRegsForIPRA(MachineFunction &MF, BitVector &SavedRegs, + bool HasFPRegSaves) const; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; @@ -63,9 +67,8 @@ MachineBasicBlock &MBB) const override; /// Returns true if the target will correctly handle shrink wrapping. - bool enableShrinkWrapping(const MachineFunction &MF) const override { - return true; - } + bool enableShrinkWrapping(const MachineFunction &MF) const override; + bool isProfitableForNoCSROpt(const Function &F) const override { // The no-CSR optimisation is bad for code size on ARM, because we can save // many registers with a single PUSH/POP pair. 
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -71,6 +71,9 @@ SpillAlignedNEONRegs("align-neon-spills", cl::Hidden, cl::init(true), cl::desc("Align ARM NEON spills in prolog and epilog")); +static cl::opt EnableExtraSpills("arm-extra-spills", cl::Hidden, + cl::init(false)); + static MachineBasicBlock::iterator skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI, unsigned NumAlignedDPRCS2Regs); @@ -1608,6 +1611,251 @@ SavedRegs.set(ARM::R4); } +// Compute the set of registers which cannot be preserved, because they are +// either modified outside the PUSH/POP instructions, or are live at the point +// where the POP will be inserted. This only considers r0-r3, which are +// currently the only registers we voluntarily save when the PCS doesn't +// require it. +void ARMFrameLowering::findRegDefsOutsideSaveRestore( + MachineFunction &MF, BitVector &UnsaveableRegs) const { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + SmallSet SaveBlocks; + SmallSet RestoreBlocks; + + if (MFI.getSavePoint()) { + SaveBlocks.insert(MFI.getSavePoint()); + RestoreBlocks.insert(MFI.getRestorePoint()); + } else { + SaveBlocks.insert(&MF.front()); + for (MachineBasicBlock &MBB : MF) + if (MBB.isReturnBlock()) + RestoreBlocks.insert(&MBB); + } + + // Walk blocks from the function entry and exits (following control flow both + // ways), stopping when we get to a save/restore block. Check for + // instructions which modify any of the registers we care about. 
+ SmallVector WorkList; + SmallSet VisitedBlocks; + LLVM_DEBUG(dbgs() << "Entry block: " << MF.front().getName() << "\n"); + WorkList.push_back(&MF.front()); + for (MachineBasicBlock &MBB : MF) { + if (MBB.isReturnBlock()) { + LLVM_DEBUG(dbgs() << "Return block: " << MBB.getName() << "\n"); + WorkList.push_back(&MBB); + } + } + + auto CheckOutsideInst = [&UnsaveableRegs, TRI](MachineInstr &MI) { + for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) { + if (MI.modifiesRegister(Reg, TRI)) { + UnsaveableRegs.set(Reg); + LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg) + << " modified by instruction " << MI << "\n"); + } + } + }; + + while (!WorkList.empty()) { + MachineBasicBlock *MBB = WorkList.pop_back_val(); + + if (VisitedBlocks.count(MBB)) + continue; + VisitedBlocks.insert(MBB); + + bool IsSave = SaveBlocks.count(MBB); + bool IsRestore = RestoreBlocks.count(MBB); + + LLVM_DEBUG(dbgs() << "Visiting block " << MBB->getName() << ", IsSave=" + << IsSave << ", IsRestore=" << IsRestore << "\n"); + + // If this is a restore block, the POP instruction will be inserted just + // before the terminator, so we need to consider any terminator + // instructions to be outside the preserved region. We also need to check + // for registers which are live at the POP insertion point, because these + // can't be restored without changing their value. + if (IsRestore) { + LivePhysRegs LPR(*TRI); + LPR.addLiveOuts(*MBB); + for (auto &Term : reverse(MBB->terminators())) { + LPR.stepBackward(Term); + CheckOutsideInst(Term); + } + + for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) { + if (LPR.contains(Reg)) { + UnsaveableRegs.set(Reg); + LLVM_DEBUG(dbgs() << "Register " << TRI->getName(Reg) + << " live-out of restore block " << MBB->getName() + << "\n"); + } + } + } + + // If this block is completely outside the save/restore region, then any + // modified registers can't be preserved. 
A save block counts as being + // inside the saved region, with the possible exception of the last few + // instructions if it's also a restore block, handled above. We don't visit + // blocks which are completely inside the saved region and don't have any + // save/restore instructions, so don't need to check that here. + if (!IsSave && !IsRestore) + for (auto &MI : *MBB) + CheckOutsideInst(MI); + + // Walk the control flow graph in both directions, except for blocks which + // are inside the PUSH/POP region. + if (IsSave || !IsRestore) + for (auto Pred : MBB->predecessors()) + WorkList.push_back(Pred); + if (!IsSave || IsRestore) + for (auto Succ : MBB->successors()) + WorkList.push_back(Succ); + } +} + +bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // Shrink wrapping is detrimental to code size because it prevents merging + // the CSR restore and function return into one POP instruction. It also + // conflicts with saving extra registers for IPRA, because it makes more + // registers live at the PUSH/POP. + if (MF.getFunction().hasMinSize()) + return false; + + return true; +} + +// When doing inter-procedural register allocation, saving extra registers in +// [r0,r3] will allow us to keep live values in them in any callers. The extra +// saves and restores don't cost us any code-size if we are already emitting +// PUSH and POP instructions. 
+unsigned ARMFrameLowering::spillExtraRegsForIPRA(MachineFunction &MF, + BitVector &SavedRegs, + bool HasFPRegSaves) const { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + //ARMFunctionInfo *AFI = MF.getInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + LLVM_DEBUG(dbgs() << "Extra spills for " << MF.getName() << ": "); + + if (!EnableExtraSpills) { + LLVM_DEBUG(dbgs() << "optimisation not enabled\n"); + return 0; + } + + // If IPRA is not enabled, nothing will be able to take advantage of the + // extra saved registers. + if (!MF.getTarget().Options.EnableIPRA) { + LLVM_DEBUG(dbgs() << "IPRA disabled\n"); + return 0; + } + + // These registers will take extra time to save and restore, and will often + // go unused, so only do this at -Oz. + if (!MF.getFunction().hasMinSize()) { + LLVM_DEBUG(dbgs() << "not minsize\n"); + return 0; + } + + // If we are not currently spilling any registers, we'd need to add an extra + // PUSH/POP pair, so this isn't worth it. + if (!SavedRegs.any()) { + LLVM_DEBUG(dbgs() << "no existing push/pop\n"); + return 0; + } + + // If we can't guarantee that this definition of the function is the one + // which will be picked by the linker, then IPRA can't make use of any extra + // saved registers. + if (!MF.getFunction().isDefinitionExact()) { + LLVM_DEBUG(dbgs() << "inexact definition\n"); + return 0; + } + + int NumVisibleCallers = 0; + for (const User *U : MF.getFunction().users()) { + if (const CallBase *Call = dyn_cast(U)) { + if (Call->getCalledOperand() == &MF.getFunction()) { + ++NumVisibleCallers; + } + } + } + + // If we don't have any direct callers in the current translation unit, + // nothing will be able to take advantage of the extra saved registers. 
+ if (NumVisibleCallers == 0) { + LLVM_DEBUG(dbgs() << "no visible callers\n"); + return 0; + } + + // If we need to emit unwind tables, these will be longer if we need to + // preserve r0-r3, so we need a lot of visible calls to make this worthwhile. + if (MF.getFunction().needsUnwindTableEntry() && NumVisibleCallers <= 8) { + LLVM_DEBUG(dbgs() << "needs unwind table\n"); + return 0; + } + + // Ok, we've decided we are going to try the optimisation. + LLVM_DEBUG(dbgs() << "enabled\n"); + + // Compute the registers which can't be preserved because they are either + // modified before the PUSH or after the POP, or are live at the point where + // the POP will be inserted. + BitVector NonPreserveableRegisters; + NonPreserveableRegisters.resize(TRI->getNumRegs()); + findRegDefsOutsideSaveRestore(MF, NonPreserveableRegisters); + + unsigned NumExtraRegs = 0; + + // We'd also like to leave some registers free so that we can use them to + // fold a small SP update into the PUSH/POP. We can't know exactly what this + // optimisation can do, because stack layout isn't finalised, but we can make + // a good enough estimate. + unsigned StackSize = MFI.estimateStackSize(MF); + + // If the stack space is large, we probably won't be able to fold the SP + // update into the push/pop, so we should use all the registers we want. If + // we have FP register saves, then the SP update will be folded into the + // VPUSH/VPOP instead, and we can use the GPRs freely. + if (StackSize > 16 || HasFPRegSaves) + StackSize = 0; + + LLVM_DEBUG(dbgs() << "Estimated " << StackSize + << " bytes of SP update being folded into push/pop\n"); + + for (Register Reg : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) { + if (StackSize) { + StackSize -= 4; + LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg) + << ", wanted for SP update\n"); + continue; + } + + // If we don't modify the register anywhere in this function, IPRA will + // already know that it is preserved, and there's no point in saving it. 
+ if (!MRI.isPhysRegModified(Reg)) { + LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg) + << ", not modified\n"); + continue; + } + + if (NonPreserveableRegisters[Reg]) { + LLVM_DEBUG(dbgs() << "not saving " << TRI->getName(Reg) + << ", modified outide save region\n"); + continue; + } + + LLVM_DEBUG(dbgs() << "also saving " << TRI->getName(Reg) << " for IPRA\n"); + SavedRegs.set(Reg); + MRI.enableCalleeSavedRegister(Reg); + ++NumExtraRegs; + } + + return NumExtraRegs; +} + void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { @@ -1998,6 +2246,14 @@ << "\n"); } + // When using IPRA, we might want to preserve some of r0-r3, to reduce + // register pressure in our callers. + unsigned ExtraIPRASpills = + spillExtraRegsForIPRA(MF, SavedRegs, NumFPRSpills != 0); + NumGPRSpills += ExtraIPRASpills; + if (ExtraIPRASpills) + CS1Spilled = true; + // Avoid spilling LR in Thumb1 if there's a tail call: it's expensive to // restore LR in that case. bool ExpensiveLRRestore = AFI->isThumb1OnlyFunction() && MFI.hasTailCall(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -2458,25 +2458,24 @@ RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. 
- if (!isTailCall) { - const uint32_t *Mask; - const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); - if (isThisReturn) { - // For 'this' returns, use the R0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(MF, CallConv); - if (!Mask) { - // Set isThisReturn to false if the calling convention is not one that - // allows 'returned' to be modeled in this way, so LowerCallResult does - // not try to pass 'this' straight through - isThisReturn = false; - Mask = ARI->getCallPreservedMask(MF, CallConv); - } - } else + const uint32_t *Mask; + const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); + if (isThisReturn) { + // For 'this' returns, use the R0-preserving mask if applicable + Mask = ARI->getThisReturnPreservedMask(MF, CallConv); + if (!Mask) { + // Set isThisReturn to false if the calling convention is not one that + // allows 'returned' to be modeled in this way, so LowerCallResult does + // not try to pass 'this' straight through + isThisReturn = false; Mask = ARI->getCallPreservedMask(MF, CallConv); + } + } else + Mask = ARI->getCallPreservedMask(MF, CallConv); - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); - } + + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -216,6 +216,10 @@ break; } LLVM_FALLTHROUGH; + case ARM::R0: + case ARM::R1: + case ARM::R2: + case ARM::R3: case ARM::R4: case ARM::R5: case ARM::R6: @@ -848,7 +852,8 @@ if (!LoRegsToSave.none()) { MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); - for (unsigned Reg : {ARM::R4, ARM::R5, ARM::R6, ARM::R7, ARM::LR}) { + for (unsigned Reg : 
{ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, ARM::R5, + ARM::R6, ARM::R7, ARM::LR}) { if (LoRegsToSave[Reg]) { bool isKill = !MRI.isLiveIn(Reg); if (isKill && !MRI.isReserved(Reg)) @@ -956,6 +961,9 @@ llvm_unreachable("callee-saved register of unexpected class"); } + if (Reg == ARM::LR) + I.setRestored(false); + // If this is a low register not used as the frame pointer, we may want to // use it for restoring the high registers. if ((ARM::tGPRRegClass.contains(Reg)) && @@ -980,6 +988,9 @@ static const unsigned AllCopyRegs[] = {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, ARM::R5, ARM::R6, ARM::R7}; static const unsigned AllHighRegs[] = {ARM::R8, ARM::R9, ARM::R10, ARM::R11}; + static const unsigned AllLoRegs[] = {ARM::R0, ARM::R1, ARM::R2, + ARM::R3, ARM::R4, ARM::R5, + ARM::R6, ARM::R7, ARM::LR}; const unsigned *AllCopyRegsEnd = std::end(AllCopyRegs); const unsigned *AllHighRegsEnd = std::end(AllHighRegs); @@ -1018,16 +1029,10 @@ BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); bool NeedsPop = false; - for (unsigned i = CSI.size(); i != 0; --i) { - CalleeSavedInfo &Info = CSI[i-1]; - unsigned Reg = Info.getReg(); - - // High registers (excluding lr) have already been dealt with - if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR)) + for (unsigned Reg : AllLoRegs) { + if (!LoRegsToRestore[Reg]) continue; - if (Reg == ARM::LR) { - Info.setRestored(false); if (!MBB.succ_empty() || MI->getOpcode() == ARM::TCRETURNdi || MI->getOpcode() == ARM::TCRETURNri) diff --git a/llvm/test/CodeGen/ARM/ipra-extra-spills.ll b/llvm/test/CodeGen/ARM/ipra-extra-spills.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ipra-extra-spills.ll @@ -0,0 +1,406 @@ +; RUN: llc -mtriple armv7a--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ARM +; RUN: llc -mtriple thumbv7a--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK 
--check-prefix=THUMB2 +; RUN: llc -mtriple thumbv6m--none-eabi -enable-ipra=true -arm-extra-spills -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1 + +; This clobbers r0, and already needs a push/pop, so we also save and restore +; r0. The push of r11 is to maintain stack alignment (though that isn't +; technically needed in this example). +define void @test_r0_r4() minsize nounwind { +; CHECK-LABEL: test_r0_r4: +; ARM: .save {r0, r4, r11, lr} +; ARM: push {r0, r4, r11, lr} +; ARM: pop {r0, r4, r11, pc} +; THUMB1: .save {r0, r4, r7, lr} +; THUMB1: push {r0, r4, r7, lr} +; THUMB1: pop {r0, r4, r7, pc} +; THUMB2: .save {r0, r4, r7, lr} +; THUMB2: push {r0, r4, r7, lr} +; THUMB2: pop {r0, r4, r7, pc} + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; This clobbers r0-r3, and already needs a push/pop, so we also save and +; restore all of them. +define void @test_r0_r1_r2_r3_r4() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4: +; CHECK: .save {r0, r1, r2, r3, r4, lr} +; CHECK: push {r0, r1, r2, r3, r4, lr} +; CHECK: pop {r0, r1, r2, r3, r4, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret void +} + +; Check that IPRA does make use of the extra saved registers. 
+define void @test_ipra() nounwind { +; CHECK-LABEL: test_ipra: +; CHECK: ASM1: r0, r1, r2, r3 +; CHECK-NOT: r0 +; CHECK-NOT: r1 +; CHECK-NOT: r2 +; CHECK-NOT: r3 +; CHECK: bl test_r0_r1_r2_r3_r4 +; CHECK-NOT: r0 +; CHECK-NOT: r1 +; CHECK-NOT: r2 +; CHECK-NOT: r3 +; CHECK: ASM2: r0, r1, r2, r3 + %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"() + %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0 + %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1 + %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2 + %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3 + call void @test_r0_r1_r2_r3_r4() + call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3) + ret void +} + +; This clobbers r0-r3, but doesn't otherwise need a push/pop, so we don't add +; them. +define void @test_r0_r1_r2_r3() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3: +; CHECK-NOT: push +; CHECK-NOT: pop + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"() + ret void +} + +; This isn't called in this function, so we don't push any extra registers. +define void @test_r0_r4_not_called() minsize nounwind { +; CHECK-LABEL: test_r0_r4_not_called: +; CHECK: .save {r4, lr} +; CHECK: push {r4, lr} +; CHECK: pop {r4, pc} +; CHECK-NOT: push +; CHECK-NOT: pop + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; This function is only optsize, not minsize, so we don't add any extra saves. +define void @test_r0_r4_not_minsize() optsize nounwind { +; CHECK-LABEL: test_r0_r4_not_minsize: +; CHECK: .save {r4, lr} +; CHECK: push {r4, lr} +; CHECK: pop {r4, pc} +; CHECK-NOT: push +; CHECK-NOT: pop + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; This function is not an exact definition (the linker could pick an +; alternative version of it), so we don't add any extra saves. 
+define linkonce_odr void @test_r0_r4_not_exact() minsize nounwind { +; CHECK-LABEL: test_r0_r4_not_exact: +; CHECK: .save {r4, lr} +; CHECK: push {r4, lr} +; CHECK: pop {r4, pc} +; CHECK-NOT: push +; CHECK-NOT: pop + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; This clobbers r0-r3, but returns a value in r0, so only r1-r3 are saved. +define i32 @test_r0_r1_r2_r3_r4_return_1() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_1: +; ARM: .save {r1, r2, r3, r4, r11, lr} +; ARM: push {r1, r2, r3, r4, r11, lr} +; ARM: pop {r1, r2, r3, r4, r11, pc} +; THUMB1: .save {r1, r2, r3, r4, r7, lr} +; THUMB1: push {r1, r2, r3, r4, r7, lr} +; THUMB1: pop {r1, r2, r3, r4, r7, pc} +; THUMB2: .save {r1, r2, r3, r4, r7, lr} +; THUMB2: push {r1, r2, r3, r4, r7, lr} +; THUMB2: pop {r1, r2, r3, r4, r7, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret i32 42 +} + +; This clobbers r0-r3, but returns a value in r0 and r1, so only r2-r3 are +; saved. +define i64 @test_r0_r1_r2_r3_r4_return_2() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_2: +; CHECK: .save {r2, r3, r4, lr} +; CHECK: push {r2, r3, r4, lr} +; CHECK: pop {r2, r3, r4, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret i64 42 +} + +; This clobbers r0-r3, but returns a value in all of r0-r3, so none of them can +; be saved. +define i128 @test_r0_r1_r2_r3_r4_return_4() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_4: +; CHECK: .save {r4, lr} +; CHECK: push {r4, lr} +; CHECK: pop {r4, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret i128 42 +} + +; This clobbers r0-r3, and returns a value in s0, so all of r0-r3 are saved (we +; previously only checked the number of return registers, ignoring their +; class). 
+define arm_aapcs_vfpcc float @test_r0_r1_r2_r3_r4_return_float() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_return_float: +; ARM: .save {r0, r1, r2, r3, r4, lr} +; ARM: push {r0, r1, r2, r3, r4, lr} +; ARM: pop {r0, r1, r2, r3, r4, pc} +; THUMB1: .save {r1, r2, r3, r4, r7, lr} +; THUMB1: push {r1, r2, r3, r4, r7, lr} +; THUMB1: pop {r1, r2, r3, r4, r7, pc} +; THUMB2: .save {r0, r1, r2, r3, r4, lr} +; THUMB2: push {r0, r1, r2, r3, r4, lr} +; THUMB2: pop {r0, r1, r2, r3, r4, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4}"() + ret float 42.0 +} + +; Saving of high registers in thumb1 is more complicated, because they need to +; be copied down to low registers to use push/pop instructions. Luckily, the +; extra registers we are preserving are low registers, which are handled by the +; outer-most push/pop pair, so this doesn't interact badly. +define void @test_save_high_regs() minsize nounwind { +; CHECK-LABEL: test_save_high_regs: +; ARM: .save {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; ARM: push {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; ARM: pop {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc} +; THUMB1: .save {r0, r1, r2, r3, r7, lr} +; THUMB1-NEXT: push {r0, r1, r2, r3, r7, lr} +; THUMB1-NEXT: mov lr, r11 +; THUMB1-NEXT: mov r7, r10 +; THUMB1-NEXT: mov r3, r9 +; THUMB1-NEXT: mov r2, r8 +; THUMB1-NEXT: .save {r8, r9, r10, r11} +; THUMB1-NEXT: push {r2, r3, r7, lr} +; THUMB1: pop {r0, r1, r2, r3} +; THUMB1-NEXT: mov r8, r0 +; THUMB1-NEXT: mov r9, r1 +; THUMB1-NEXT: mov r10, r2 +; THUMB1-NEXT: mov r11, r3 +; THUMB1-NEXT: pop {r0, r1, r2, r3, r7, pc} +; THUMB2: .save {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; THUMB2: push.w {r0, r1, r2, r3, r7, r8, r9, r10, r11, lr} +; THUMB2: pop.w {r0, r1, r2, r3, r7, r8, r9, r10, r11, pc} + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r8},~{r9},~{r10},~{r11}"() + ret void +} + +; We can also use extra registers in the PUSH/POP instructions to move the SP +; to make space for local 
variables. These registers aren't preserved, because +; the space they are saved in is used for the local variable. We try to back +; off the extra-CSRs optimisation to allow this to still happen. In this case, +; there are 8 bytes of stack space needed, so we preserve two argument +; registers and use the other two for the SP update. +define void @test_r0_r1_r2_r3_r4_stack8() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8: +; CHECK: .save {r2, r3, r4, lr} +; CHECK: push {r0, r1, r2, r3, r4, lr} +; CHECK: pop {r0, r1, r2, r3, r4, pc} + %a = alloca [2 x i32], align 4 + call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([2 x i32]* %a, i32 42) + ret void +} + +; Check that, when the above function is called, r0 and r1 (used for the SP +; updates) are considered clobbered, and r2 and r3 are preserved. +define void @test_r0_r1_r2_r3_r4_stack8_caller() nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack8_caller: +; CHECK: ASM1: r0, r1, r2, r3 +; CHECK-NEXT: @NO_APP +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: bl test_r0_r1_r2_r3_r4 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: @APP +; CHECK-NEXT: ASM2: r0, r1, r2, r3 + %regs = call { i32, i32, i32, i32 } asm sideeffect "// ASM1: $0, $1, $2, $3", "={r0},={r1},={r2},={r3}"() + %r0 = extractvalue { i32, i32, i32, i32 } %regs, 0 + %r1 = extractvalue { i32, i32, i32, i32 } %regs, 1 + %r2 = extractvalue { i32, i32, i32, i32 } %regs, 2 + %r3 = extractvalue { i32, i32, i32, i32 } %regs, 3 + call void @test_r0_r1_r2_r3_r4_stack8() + call void asm sideeffect "// ASM2: $0, $1, $2, $3", "{r0},{r1},{r2},{r3}"(i32 %r0, i32 %r1, i32 %r2, i32 %r3) + ret void +} + +; Like @test_r0_r1_r2_r3_r4_stack8, but 16 bytes of stack space are needed, so +; all of r0-r3 are used for the SP update, and not preserved. 
+define void @test_r0_r1_r2_r3_r4_stack16() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack16: +; CHECK: .save {r4, lr} +; CHECK: push {r0, r1, r2, r3, r4, lr} +; CHECK: pop {r0, r1, r2, r3, r4, pc} + %a = alloca [4 x i32], align 4 + call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([4 x i32]* %a, i32 42) + ret void +} + +; If more than 16 bytes of stack space are needed, it's unlikely that the +; SP-update folding optimisation will succeed, so we revert back to preserving +; r0-r3 for use in our callers. +define void @test_r0_r1_r2_r3_r4_stack24() minsize nounwind { +; CHECK-LABEL: test_r0_r1_r2_r3_r4_stack24: +; CHECK: .save {r0, r1, r2, r3, r4, lr} +; CHECK: push {r0, r1, r2, r3, r4, lr} +; CHECK: pop {r0, r1, r2, r3, r4, pc} + %a = alloca [6 x i32], align 4 + call void asm sideeffect "str $1, [$0]; str $1, [$0, #4]", "{r0},{r1},~{r2},~{r3},~{r4}"([6 x i32]* %a, i32 42) + ret void +} + +define i32 @tail_callee(i32 %a, i32 %b) minsize nounwind { +entry: + tail call void asm sideeffect "", "~{r2}"() + ret i32 %a +} + +; The tail call happens outside the save/restore region, so prevents us from +; preserving some registers. r0 and r1 are outgoing arguments to the tail-call, +; so can't be preserved. r2 is modified inside the tail-called function, so +; can't be preserved. r3 is known to be preserved by the callee, so can be +; preserved. For Thumb1, we can't (efficiently) use a tail-call here, so r1-r3 +; are all preserved, with r0 being the return value. 
+define i32 @test_tail_call() minsize nounwind { +entry: +; CHECK-LABEL: test_tail_call: +; ARM: .save {r3, lr} +; ARM: push {r3, lr} +; ARM: pop {r3, lr} +; ARM: b tail_callee +; THUMB2: .save {r3, lr} +; THUMB2: push {r3, lr} +; THUMB2: pop.w {r3, lr} +; THUMB2: b tail_callee +; THUMB1: .save {r1, r2, r3, lr} +; THUMB1: push {r1, r2, r3, lr} +; THUMB1: bl tail_callee +; THUMB1: pop {r1, r2, r3, pc} + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"() + %call = tail call i32 @tail_callee(i32 3, i32 4) + ret i32 %call +} + +declare i32 @tail_callee_external(i32 %a, i32 %b) + +; If we tail-call an external function, it could clobber any of r0-r3. +define i32 @test_tail_call_external() minsize nounwind { +entry: +; CHECK-LABEL: test_tail_call_external: +; ARM: .save {r11, lr} +; ARM: push {r11, lr} +; ARM: pop {r11, lr} +; ARM: b tail_callee_external +; THUMB2: .save {r7, lr} +; THUMB2: push {r7, lr} +; THUMB2: pop.w {r7, lr} +; THUMB2: b tail_callee_external +; THUMB1: .save {r1, r2, r3, lr} +; THUMB1: push {r1, r2, r3, lr} +; THUMB1: bl tail_callee_external +; THUMB1: pop {r1, r2, r3, pc} + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"() + %call = tail call i32 @tail_callee_external(i32 3, i32 4) + ret i32 %call +} + +define linkonce_odr i32 @tail_callee_linkonce_odr(i32 %a, i32 %b) minsize nounwind { +entry: + tail call void asm sideeffect "", "~{r2}"() + ret i32 %a +} + +; If a tail-callee has an interposable linkage type (such as linkonce_odr), we +; can't assume the linker will pick the definition we can see, so must assume +; it clobbers all of r0-r3. 
+define i32 @test_tail_call_linkonce_odr() minsize nounwind { +entry: +; CHECK-LABEL: test_tail_call_linkonce_odr: +; ARM: .save {r11, lr} +; ARM: push {r11, lr} +; ARM: pop {r11, lr} +; ARM: b tail_callee_linkonce_odr +; THUMB2: .save {r7, lr} +; THUMB2: push {r7, lr} +; THUMB2: pop.w {r7, lr} +; THUMB2: b tail_callee_linkonce_odr +; THUMB1: .save {r1, r2, r3, lr} +; THUMB1: push {r1, r2, r3, lr} +; THUMB1: bl tail_callee_linkonce_odr +; THUMB1: pop {r1, r2, r3, pc} + tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{lr}"() + %call = tail call i32 @tail_callee_linkonce_odr(i32 3, i32 4) + ret i32 %call +} + +; This function doesn't have the nounwind attribute, so unwind tables will be +; emitted. Saving r0-r3 requires a longer unwind instruction sequence, which +; results in an increase in total code size if there are few callers to make +; use of the extra registers. +define void @test_unwind_tables() minsize { +; CHECK-LABEL: test_unwind_tables: +; ARM: .save {r4, lr} +; ARM: push {r4, lr} +; ARM: pop {r4, pc} +; THUMB1: .save {r4, lr} +; THUMB1: push {r4, lr} +; THUMB1: pop {r4, pc} +; THUMB2: .save {r4, lr} +; THUMB2: push {r4, lr} +; THUMB2: pop {r4, pc} + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; This requires an unwind table, but has many call sites, so overall we expect +; the benefits to outweigh the size increase of the unwind table. 
+define void @test_unwind_tables_many_calls() minsize { +; CHECK-LABEL: test_unwind_tables_many_calls: +; ARM: .save {r0, r4, r11, lr} +; ARM: push {r0, r4, r11, lr} +; ARM: pop {r0, r4, r11, pc} +; THUMB1: .save {r0, r4, r7, lr} +; THUMB1: push {r0, r4, r7, lr} +; THUMB1: pop {r0, r4, r7, pc} +; THUMB2: .save {r0, r4, r7, lr} +; THUMB2: push {r0, r4, r7, lr} +; THUMB2: pop {r0, r4, r7, pc} + call void asm sideeffect "", "~{r0},~{r4}"() + ret void +} + +; We don't do this optimisation if there are no callers in the same translation +; unit (otherwise IPRA wouldn't be able to take advantage of the extra saved +; registers), so most functions in this file are called here. +define void @caller() { +; CHECK-LABEL: caller: + call void @test_r0_r4() + call void @test_r0_r1_r2_r3_r4() + call void @test_r0_r1_r2_r3() + call void @test_r0_r4_not_minsize() + call void @test_r0_r4_not_exact() + %t1 = call i32 @test_r0_r1_r2_r3_r4_return_1() + %t2 = call i64 @test_r0_r1_r2_r3_r4_return_2() + %t3 = call i128 @test_r0_r1_r2_r3_r4_return_4() + %t4 = call float @test_r0_r1_r2_r3_r4_return_float() + call void @test_save_high_regs() + call void @test_r0_r1_r2_r3_r4_stack16() + call void @test_r0_r1_r2_r3_r4_stack24() + %t5 = call i32 @test_tail_call() + %t6 = call i32 @test_tail_call_external() + %t7 = call i32 @test_tail_call_linkonce_odr() + call void @test_unwind_tables() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + call void @test_unwind_tables_many_calls() + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll --- a/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll +++ b/llvm/test/CodeGen/Thumb2/ifcvt-minsize.ll @@ 
-66,16 +66,13 @@ define void @f3(i32 %x) #0 { ; CHECK-LABEL: f3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: cmp r0, #1 -; CHECK-NEXT: bne .LBB2_2 -; CHECK-NEXT: @ %bb.1: @ %t ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: bl fn -; CHECK-NEXT: pop.w {r7, lr} -; CHECK-NEXT: .LBB2_2: @ %f -; CHECK-NEXT: bx lr +; CHECK-NEXT: cmp r0, #1 +; CHECK-NEXT: itt eq +; CHECK-NEXT: moveq r0, #0 +; CHECK-NEXT: bleq fn +; CHECK-NEXT: pop {r7, pc} entry: %p = icmp eq i32 %x, 1 br i1 %p, label %t, label %f