diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -42,6 +42,7 @@ FunctionPass *createAArch64IndirectThunks(); FunctionPass *createAArch64SpeculationHardeningPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); +ModulePass *createAArch64LowerHomogeneousPrologEpilogPass(); FunctionPass *createAArch64SIMDInstrOptPass(); ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); @@ -77,6 +78,7 @@ void initializeAArch64SLSHardeningPass(PassRegistry&); void initializeAArch64SpeculationHardeningPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); +void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -105,6 +105,16 @@ } private: + /// Returns true if a homogeneous prolog or epilog code can be emitted + /// for the size optimization. If so, HOM_Prolog/HOM_Epilog pseudo + /// instructions are emitted in place. When Exit block is given, this check is + /// for epilog. + bool homogeneousPrologEpilog(MachineFunction &MF, + MachineBasicBlock *Exit = nullptr) const; + + /// Returns true if CSRs should be paired. 
+ bool producePairRegisters(MachineFunction &MF) const; + bool shouldCombineCSRLocalStackBump(MachineFunction &MF, uint64_t StackBumpBytes) const; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -175,6 +175,73 @@ cl::desc("merge settag instruction in function epilog"), cl::init(true), cl::Hidden); +cl::opt<bool> EnableHomogeneousPrologEpilog( + "homogeneous-prolog-epilog", cl::init(false), cl::Hidden, + cl::desc("Emit homogeneous prologue and epilogue for the size " + "optimization (default = off)")); + +static bool produceCompactUnwindFrame(MachineFunction &MF); +static bool needsWinCFI(const MachineFunction &MF); + +static uint64_t getArgumentPopSize(MachineFunction &MF, + MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + bool IsTailCallReturn = false; + if (MBB.end() != MBBI) { + unsigned RetOpcode = MBBI->getOpcode(); + IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || + RetOpcode == AArch64::TCRETURNri || + RetOpcode == AArch64::TCRETURNriBTI; + } + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + + uint64_t ArgumentPopSize = 0; + if (IsTailCallReturn) { + MachineOperand &StackAdjust = MBBI->getOperand(1); + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. 
+ ArgumentPopSize = AFI->getArgumentStackToRestore(); + } + + return ArgumentPopSize; +} + +/// Returns true if a homogeneous prolog or epilog code can be emitted +/// for the size optimization. If possible, a frame helper call is injected. +/// When Exit block is given, this check is for epilog. +bool AArch64FrameLowering::homogeneousPrologEpilog( + MachineFunction &MF, MachineBasicBlock *Exit) const { + if (!MF.getFunction().hasOptSize()) + return false; + if (!EnableHomogeneousPrologEpilog) + return false; + if (ReverseCSRRestoreSeq) + return false; + if (EnableRedZone) + return false; + if (needsWinCFI(MF)) + return false; + if (MF.getFrameInfo().hasVarSizedObjects()) + return false; + if (MF.getSubtarget().getRegisterInfo()->needsStackRealignment(MF)) + return false; + if (Exit && getArgumentPopSize(MF, *Exit)) + return false; + return true; +} + +bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const { + return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF); +} + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// This is the biggest offset to the stack pointer we can encode in aarch64 @@ -472,6 +539,8 @@ const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + if (homogeneousPrologEpilog(MF)) + return false; if (AFI->getLocalStackSize() == 0) return false; @@ -1025,6 +1094,9 @@ {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); NumBytes = 0; + } else if (homogeneousPrologEpilog(MF)) { + // Stack has been already adjusted. 
+ NumBytes -= PrologueSaveSize; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI); @@ -1416,7 +1488,6 @@ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; - bool IsTailCallReturn = false; bool NeedsWinCFI = needsWinCFI(MF); bool HasWinCFI = false; bool IsFunclet = false; @@ -1427,10 +1498,6 @@ if (MBB.end() != MBBI) { DL = MBBI->getDebugLoc(); - unsigned RetOpcode = MBBI->getOpcode(); - IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi || - RetOpcode == AArch64::TCRETURNri || - RetOpcode == AArch64::TCRETURNriBTI; IsFunclet = isFuncletReturnInstr(*MBBI); } @@ -1445,21 +1512,7 @@ // Initial and residual are named for consistency with the prologue. Note that // in the epilogue, the residual adjustment is executed first. - uint64_t ArgumentPopSize = 0; - if (IsTailCallReturn) { - MachineOperand &StackAdjust = MBBI->getOperand(1); - - // For a tail-call in a callee-pops-arguments environment, some or all of - // the stack may actually be in use for the call's arguments, this is - // calculated during LowerCall and consumed here... - ArgumentPopSize = StackAdjust.getImm(); - } else { - // ... otherwise the amount to pop is *all* of the argument space, - // conveniently stored in the MachineFunctionInfo by - // LowerFormalArguments. This will, of course, be zero for the C calling - // convention. - ArgumentPopSize = AFI->getArgumentStackToRestore(); - } + uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB); // The stack frame should be like below, // @@ -1502,6 +1555,26 @@ // function. 
if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); + if (homogeneousPrologEpilog(MF, &MBB)) { + assert(!NeedsWinCFI); + auto LastPopI = MBB.getFirstTerminator(); + if (LastPopI != MBB.begin()) { + auto HomogeneousEpilog = std::prev(LastPopI); + if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog) + LastPopI = HomogeneousEpilog; + } + + // Adjust local stack + uint64_t LocalStackSize = AFI->getLocalStackSize(); + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + {(int64_t)LocalStackSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI); + + // SP has been already adjusted while restoring callee save regs. + // We've bailed-out the case with adjusting SP for arguments. + assert(AfterCSRPopSize == 0); + return; + } bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); // Assume we can't combine the last pop with the sp restore. @@ -2165,6 +2238,22 @@ MBB.addLiveIn(AArch64::X18); } + if (homogeneousPrologEpilog(MF)) { + auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog)) + .setMIFlag(MachineInstr::FrameSetup); + + for (auto &RPI : RegPairs) { + MIB.addReg(RPI.Reg1, RegState::Implicit); + MIB.addReg(RPI.Reg2, RegState::Implicit); + + // Update register live in. 
+ if (!MRI.isReserved(RPI.Reg1)) + MBB.addLiveIn(RPI.Reg1); + if (!MRI.isReserved(RPI.Reg2)) + MBB.addLiveIn(RPI.Reg2); + } + return true; + } for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { RegPairInfo RPI = *RPII; @@ -2360,6 +2449,14 @@ for (const RegPairInfo &RPI : reverse(RegPairs)) if (!RPI.isScalable()) EmitMI(RPI); + } else if (homogeneousPrologEpilog(MF, &MBB)) { + auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog)) + .setMIFlag(MachineInstr::FrameDestroy); + for (auto &RPI : RegPairs) { + MIB.addReg(RPI.Reg1, RegState::Implicit | RegState::Define); + MIB.addReg(RPI.Reg2, RegState::Implicit | RegState::Define); + } + return true; } else for (const RegPairInfo &RPI : RegPairs) if (!RPI.isScalable()) @@ -2429,7 +2526,7 @@ // MachO's compact unwind format relies on all registers being stored in // pairs. // FIXME: the usual format is actually better if unwinding isn't needed. - if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister && + if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister && !SavedRegs.test(PairedReg)) { SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && @@ -2508,7 +2605,7 @@ // MachO's compact unwind format relies on all registers being stored in // pairs, so if we need to spill one extra for BigStack, then we need to // store the pair. 
- if (produceCompactUnwindFrame(MF)) + if (producePairRegisters(MF)) SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = UnspilledCSGPR; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3779,6 +3779,11 @@ Sched<[]>; } +// Pseudo instructions for homogeneous prolog/epilog +let isPseudo = 1 in { + def HOM_Prolog : Pseudo<(outs), (ins), []>, Sched<[]>; + def HOM_Epilog : Pseudo<(outs), (ins), []>, Sched<[]>; +} //===----------------------------------------------------------------------===// // Floating point immediate move. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp @@ -0,0 +1,658 @@ +//===- AArch64LowerHomogeneousPrologEpilog.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that lowers homogeneous prolog/epilog instructions. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64InstPrinter.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include <sstream> + +using namespace llvm; + +#define AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME \ + "AArch64 homogeneous prolog/epilog lowering pass" + +cl::opt<int> FrameHelperSizeThreshold( + "frame-helper-size-threshold", cl::init(2), cl::Hidden, + cl::desc("The minimum number of instructions that are outlined in a frame " + "helper (default = 2)")); + +namespace { + +class AArch64LowerHomogeneousPE { +public: + const AArch64InstrInfo *TII; + + AArch64LowerHomogeneousPE(Module *M, MachineModuleInfo *MMI) + : M(M), MMI(MMI) {} + + bool run(); + bool runOnMachineFunction(MachineFunction &Fn); + +private: + Module *M; + MachineModuleInfo *MMI; + + bool runOnMBB(MachineBasicBlock &MBB); + bool runOnMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + + /// Lower a HOM_Prolog pseudo instruction into a helper call + /// or a sequence of homogeneous stores. + /// When an fp setup follows, it can be optimized. + bool lowerHOM_Prolog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + /// Lower a HOM_Epilog pseudo instruction into a helper call + /// or a sequence of homogeneous loads. + /// When a return follows, it can be optimized.
+ bool lowerHOM_Epilog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); +}; + +class AArch64LowerHomogeneousPrologEpilog : public ModulePass { +public: + static char ID; + + AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) { + initializeAArch64LowerHomogeneousPrologEpilogPass( + *PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineModuleInfoWrapperPass>(); + AU.addPreserved<MachineModuleInfoWrapperPass>(); + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } + bool runOnModule(Module &M) override; + + StringRef getPassName() const override { + return AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME; + } +}; + +} // end anonymous namespace + +char AArch64LowerHomogeneousPrologEpilog::ID = 0; + +INITIALIZE_PASS(AArch64LowerHomogeneousPrologEpilog, + "aarch64-lower-homogeneous-prolog-epilog", + AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME, false, false) + +bool AArch64LowerHomogeneousPrologEpilog::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + MachineModuleInfo *MMI = + &getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); + return AArch64LowerHomogeneousPE(&M, MMI).run(); +} + +bool AArch64LowerHomogeneousPE::run() { + bool Changed = false; + for (auto &F : *M) { + if (F.empty()) + continue; + + MachineFunction *MF = MMI->getMachineFunction(F); + if (!MF) + continue; + Changed |= runOnMachineFunction(*MF); + } + + return Changed; +} +enum FrameHelperType { Prolog, PrologFrame, Epilog, EpilogTail }; + +/// Return a frame helper name with the given CSRs and the helper type. +/// For instance, a prolog helper that saves x19 and x20 is named as +/// OUTLINED_FUNCTION_PROLOG_x19x20. 
+static std::string getFrameHelperName(SmallVectorImpl<unsigned> &Regs, + FrameHelperType Type, unsigned FpOffset) { + std::ostringstream RegStream; + switch (Type) { + case FrameHelperType::Prolog: + RegStream << "OUTLINED_FUNCTION_PROLOG_"; + break; + case FrameHelperType::PrologFrame: + RegStream << "OUTLINED_FUNCTION_PROLOG_FRAME" << FpOffset << "_"; + break; + case FrameHelperType::Epilog: + RegStream << "OUTLINED_FUNCTION_EPILOG_"; + break; + case FrameHelperType::EpilogTail: + RegStream << "OUTLINED_FUNCTION_EPILOG_TAIL_"; + break; + } + + for (auto Reg : Regs) + RegStream << AArch64InstPrinter::getRegisterName(Reg); + + return RegStream.str(); +} + +/// Create a Function for the unique frame helper with the given name. +/// Return a newly created MachineFunction with an empty MachineBasicBlock. +static MachineFunction &createFrameHelperMachineFunction(Module *M, + MachineModuleInfo *MMI, + StringRef Name) { + LLVMContext &C = M->getContext(); + Function *F = M->getFunction(Name); + assert(F == nullptr && "Function has been created before"); + F = Function::Create(FunctionType::get(Type::getVoidTy(C), false), + Function::ExternalLinkage, Name, M); + assert(F && "Function was null!"); + + // Use ODR linkage to avoid duplication. + F->setLinkage(GlobalValue::LinkOnceODRLinkage); + F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // Set no-opt/minsize, so we don't insert padding between outlined + // functions. + F->addFnAttr(Attribute::OptimizeNone); + F->addFnAttr(Attribute::NoInline); + F->addFnAttr(Attribute::MinSize); + F->addFnAttr(Attribute::Naked); + + MachineFunction &MF = MMI->getOrCreateMachineFunction(*F); + // Remove unnecessary register liveness and set NoVRegs. 
+ MF.getProperties().reset(MachineFunctionProperties::Property::TracksLiveness); + MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA); + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); + MF.getRegInfo().freezeReservedRegs(MF); + + // Create entry block. + BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F); + IRBuilder<> Builder(EntryBB); + Builder.CreateRetVoid(); + + // Insert the new block into the function. + MachineBasicBlock *MBB = MF.CreateMachineBasicBlock(); + MF.insert(MF.begin(), MBB); + + return MF; +} + +/// Emit a homogeneous store-pair instruction for frame-setup. +static void emitHomogeneousStore(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator Pos, + const TargetInstrInfo &TII, unsigned Reg1, + unsigned Reg2) { + bool IsFloat = AArch64::FPR64RegClass.contains(Reg1); + assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2))); + int Opc = IsFloat ? AArch64::STPDpre : AArch64::STPXpre; + MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc)); + MIB.addDef(AArch64::SP) + .addReg(Reg2) + .addReg(Reg1) + .addReg(AArch64::SP) + .addImm(-2) + .addMemOperand( + MF.getMachineMemOperand(MachinePointerInfo::getUnknownStack(MF), + MachineMemOperand::MOStore, 8, Align(8))) + .addMemOperand( + MF.getMachineMemOperand(MachinePointerInfo::getUnknownStack(MF), + MachineMemOperand::MOStore, 8, Align(8))) + .setMIFlag(MachineInstr::FrameSetup); +} + +/// Emit a homogeneous load-pair instruction for frame-destroy. +static void emitHomogeneousLoad(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator Pos, + const TargetInstrInfo &TII, unsigned Reg1, + unsigned Reg2) { + bool IsFloat = AArch64::FPR64RegClass.contains(Reg1); + assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2))); + int Opc = IsFloat ? 
AArch64::LDPDpost : AArch64::LDPXpost; + MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc)); + MIB.addDef(AArch64::SP) + .addReg(Reg2) + .addReg(Reg1) + .addReg(AArch64::SP) + .addImm(2) + .addMemOperand( + MF.getMachineMemOperand(MachinePointerInfo::getUnknownStack(MF), + MachineMemOperand::MOLoad, 8, Align(8))) + .addMemOperand( + MF.getMachineMemOperand(MachinePointerInfo::getUnknownStack(MF), + MachineMemOperand::MOLoad, 8, Align(8))) + .setMIFlag(MachineInstr::FrameDestroy); +} + +/// Return a unique function if a helper can be formed with the given Regs +/// and frame type. +/// 1) _OUTLINED_FUNCTION_PROLOG_x19x20x21x22: +/// stp x20, x19, [sp, #-16]! +/// stp x22, x21, [sp, #-16]! +/// ret +/// +/// 2) _OUTLINED_FUNCTION_PROLOG_x19x20x30x29x21x22: +/// mov x16, x30 +/// ldp x29, x30, [sp], #16 ; Restore x29/x30 stored at the caller +/// stp x20, x19, [sp, #-16]! +/// stp x29, x30, [sp, #-16]! ; Save x29/30 (NeedSaveLR = true) +/// stp x22, x21, [sp, #-16]! +/// br x16 +/// +/// 3) _OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x21x22: +/// stp x20, x19, [sp, #-16]! +/// stp x22, x21, [sp, #-16]! +/// add fp, sp, #32 +/// ret +/// +/// 4) _OUTLINED_FUNCTION_PROLOG_FRAME0_x19x20x30x29x21x22: +/// mov x16, x30 +/// ldp x29, x30, [sp], #16 ; Restore x29/x30 stored at the caller +/// stp x20, x19, [sp, #-16]! +/// stp x29, x30, [sp, #-16]! ; Save x29/30 (NeedSaveLR = true) +/// stp x22, x21, [sp, #-16]! 
+/// mov fp, sp +/// br x16 +/// +/// 5) _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22: +/// mov x16, x30 +/// ldp x22, x21, [sp], #16 +/// ldp x20, x19, [sp], #16 +/// ldp x29, x30, [sp], #16 +/// br x16 +/// +/// 6) _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22: +/// ldp x22, x21, [sp], #16 +/// ldp x20, x19, [sp], #16 +/// ldp x29, x30, [sp], #16 +/// ret +/// @param M module +/// @param MMI machine module info +/// @param Regs callee save regs that the helper will handle +/// @param Type frame helper type +/// @return a helper function +static Function *getOrCreateFrameHelper(Module *M, MachineModuleInfo *MMI, + SmallVectorImpl<unsigned> &Regs, + FrameHelperType Type, + unsigned FpOffset = 0) { + assert(Regs.size() >= 2); + bool NeedSaveLR = false; + if (Type == FrameHelperType::Prolog || Type == FrameHelperType::PrologFrame) { + // When FP/LR is the first pair, it has been already saved in the caller. + NeedSaveLR = Regs[0] != AArch64::LR; + if (!NeedSaveLR) { + // Prolog helpers do not need to store FP/LR + Regs.erase(Regs.begin()); + Regs.erase(Regs.begin()); + } + } + + auto Name = getFrameHelperName(Regs, Type, FpOffset); + auto F = M->getFunction(Name); + if (F) + return F; + + auto &MF = createFrameHelperMachineFunction(M, MMI, Name); + MachineBasicBlock &MBB = *MF.begin(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + if (NeedSaveLR) { + // Stash LR to X16 + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ORRXrs)) + .addDef(AArch64::X16) + .addReg(AArch64::XZR) + .addUse(AArch64::LR) + .addImm(0); + // Restore FP/LR from the stack + emitHomogeneousLoad(MF, MBB, MBB.end(), TII, AArch64::LR, AArch64::FP); + } + + int Size = (int)Regs.size(); + switch (Type) { + case FrameHelperType::Prolog: + for (int I = 0; I < Size; I += 2) + emitHomogeneousStore(MF, MBB, MBB.end(), TII, Regs[I], Regs[I + 1]); + if (NeedSaveLR) + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::BR)) + 
.addUse(AArch64::X16); + else + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET)) + .addReg(AArch64::LR, RegState::Undef); + break; + + case FrameHelperType::PrologFrame: + for (int I = 0; I < Size; I += 2) + emitHomogeneousStore(MF, MBB, MBB.end(), TII, Regs[I], Regs[I + 1]); + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ADDXri)) + .addDef(AArch64::FP) + .addUse(AArch64::SP) + .addImm(FpOffset) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + if (NeedSaveLR) + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::BR)) + .addUse(AArch64::X16); + else + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET)) + .addReg(AArch64::LR, RegState::Undef); + break; + + case FrameHelperType::Epilog: + // Stash LR to X16 + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ORRXrs)) + .addDef(AArch64::X16) + .addReg(AArch64::XZR) + .addUse(AArch64::LR) + .addImm(0); + // Restore CSRs in the reverse order + for (int I = Size - 1; I >= 0; I -= 2) + emitHomogeneousLoad(MF, MBB, MBB.end(), TII, Regs[I - 1], Regs[I]); + // Branch on X16 not to trash LR. + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::BR)) + .addUse(AArch64::X16); + break; + + case FrameHelperType::EpilogTail: + // Restore CSRs in the reverse order + for (int I = Size - 1; I >= 0; I -= 2) + emitHomogeneousLoad(MF, MBB, MBB.end(), TII, Regs[I - 1], Regs[I]); + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET)) + .addReg(AArch64::LR, RegState::Undef); + break; + } + + return M->getFunction(Name); +} + +/// Get a valid non-negative adjustment to set fp from sp. +/// @param MBBI instruction setting fp from sp. +/// @return a valid non-negative adjustment. Or -1 for any other case.
+static int getFpAdjustmentFromSp(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + if (!MI.getFlag(MachineInstr::FrameSetup)) + return -1; + unsigned Opcode = MI.getOpcode(); + if (Opcode != AArch64::ADDXri && Opcode != AArch64::SUBXri) + return -1; + if (!MI.getOperand(0).isReg()) + return -1; + if (MI.getOperand(0).getReg() != AArch64::FP) + return -1; + if (!MI.getOperand(1).isReg()) + return -1; + if (MI.getOperand(1).getReg() != AArch64::SP) + return -1; + + int Imm = MI.getOperand(2).getImm(); + if (Opcode == AArch64::ADDXri && Imm >= 0) + return Imm; + else if (Opcode == AArch64::SUBXri && Imm <= 0) + return -Imm; + + return -1; +} + +/// This function checks if a frame helper should be used for +/// HOM_Prolog/HOM_Epilog pseudo instruction expansion. +/// @param MBB machine basic block +/// @param NextMBBI next instruction following HOM_Prolog/HOM_Epilog +/// @param Regs callee save registers that are saved or restored. +/// @param Type frame helper type +/// @return True if a use of helper is qualified. +static bool shouldUseFrameHelper(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &NextMBBI, + SmallVectorImpl<unsigned> &Regs, + FrameHelperType Type) { + int RegCount = (int)Regs.size(); + assert(RegCount > 0 && (RegCount % 2 == 0)); + // # of instructions that will be outlined. + int InstCount = RegCount >> 1; + + // Do not use a helper call when not saving LR. + if (std::find(Regs.begin(), Regs.end(), AArch64::LR) == Regs.end()) + return false; + + switch (Type) { + case FrameHelperType::Prolog: + // Prolog helper cannot save FP/LR. + InstCount--; + break; + case FrameHelperType::PrologFrame: { + // Prolog helper cannot save FP/LR. + // Check if the following instruction is beneficial to be included. + if (NextMBBI == MBB.end()) + return false; + int FpAdjustment = getFpAdjustmentFromSp(NextMBBI); + if (FpAdjustment == -1) + return false; + // Effectively no change in InstCount since FpAdjustment is included.
+ break; + } + case FrameHelperType::Epilog: + // No change in InstCount for the regular epilog case. + break; + case FrameHelperType::EpilogTail: { + // EpilogTail helper includes the caller's return. + if (NextMBBI == MBB.end()) + return false; + if (NextMBBI->getOpcode() != AArch64::RET_ReallyLR) + return false; + InstCount++; + break; + } + } + + return InstCount >= FrameHelperSizeThreshold; +} + +/// Lower a HOM_Epilog pseudo instruction into a helper call while +/// creating the helper on demand. Or emit a sequence of homogeneous loads in +/// place when not using a helper call. +/// +/// 1. With a helper including ret +/// HOM_Epilog x30, x29, x19, x20, x21, x22 ; MBBI +/// ret ; NextMBBI +/// => +/// b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22 +/// ... ; NextMBBI +/// +/// 2. With a helper +/// HOM_Epilog x30, x29, x19, x20, x21, x22 +/// => +/// bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22 +/// +/// 3. Without a helper +/// HOM_Epilog x30, x29, x19, x20, x21, x22 +/// => +/// ldp x22, x21, [sp], #16 +/// ldp x20, x19, [sp], #16 +/// ldp x29, x30, [sp], #16 +bool AArch64LowerHomogeneousPE::lowerHOM_Epilog( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + auto &MF = *MBB.getParent(); + MachineInstr &MI = *MBBI; + + DebugLoc DL = MI.getDebugLoc(); + SmallVector<unsigned, 8> Regs; + for (auto &MO : MI.implicit_operands()) + if (MO.isReg()) + Regs.push_back(MO.getReg()); + int Size = (int)Regs.size(); + if (Size == 0) + return false; + // Registers are in pair. 
+ assert(Size % 2 == 0); + assert(MI.getOpcode() == AArch64::HOM_Epilog); + + auto Return = NextMBBI; + if (shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::EpilogTail)) { + // When MBB ends with a return, emit a tail-call to the epilog helper + auto EpilogTailHelper = + getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::EpilogTail); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::TCRETURNdi)) + .addGlobalAddress(EpilogTailHelper) + .addImm(0) + .setMIFlag(MachineInstr::FrameDestroy) + .copyImplicitOps(MI) + .copyImplicitOps(*Return); + NextMBBI = std::next(Return); + Return->eraseFromParent(); + } else if (shouldUseFrameHelper(MBB, NextMBBI, Regs, + FrameHelperType::Epilog)) { + // The default epilog helper case. + auto EpilogHelper = + getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Epilog); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addGlobalAddress(EpilogHelper) + .setMIFlag(MachineInstr::FrameDestroy) + .copyImplicitOps(MI); + } else { + // Fall back to no-helper. + for (int I = Size - 1; I >= 0; I -= 2) + emitHomogeneousLoad(MF, MBB, MBBI, *TII, Regs[I - 1], Regs[I]); + } + + MBBI->eraseFromParent(); + return true; +} + +/// Lower a HOM_Prolog pseudo instruction into a helper call while +/// creating the helper on demand. Or emit a sequence of homogeneous stores in +/// place when not using a helper call. +/// +/// 1. With a helper including frame-setup +/// HOM_Prolog x30, x29, x19, x20, x21, x22 ; MBBI +/// add x29, sp, #32 ; NextMBBI +/// => +/// stp x29, x30, [sp, #-16]! +/// bl _OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x21x22 +/// ... ; NextMBBI +/// +/// 2. With a helper +/// HOM_Prolog x30, x29, x19, x20, x21, x22 +/// => +/// stp x29, x30, [sp, #-16]! +/// bl _OUTLINED_FUNCTION_PROLOG_x19x20x21x22 +/// +/// 3. Without a helper +/// HOM_Prolog x30, x29, x19, x20, x21, x22 +/// => +/// stp x29, x30, [sp, #-16]! +/// stp x20, x19, [sp, #-16]! +/// stp x22, x21, [sp, #-16]!
+bool AArch64LowerHomogeneousPE::lowerHOM_Prolog( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + auto &MF = *MBB.getParent(); + MachineInstr &MI = *MBBI; + + DebugLoc DL = MI.getDebugLoc(); + SmallVector<unsigned, 8> Regs; + for (auto &MO : MI.implicit_operands()) + if (MO.isReg()) + Regs.push_back(MO.getReg()); + int Size = (int)Regs.size(); + if (Size == 0) + return false; + // Allow compact unwind case only for now. + assert(Size % 2 == 0); + assert(MI.getOpcode() == AArch64::HOM_Prolog); + + auto FpAdjustment = NextMBBI; + if (shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::PrologFrame)) { + // FP/LR is stored at the top of stack before the prolog helper call. + emitHomogeneousStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP); + auto FpOffset = getFpAdjustmentFromSp(NextMBBI); + auto PrologFrameHelper = getOrCreateFrameHelper( + M, MMI, Regs, FrameHelperType::PrologFrame, FpOffset); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addGlobalAddress(PrologFrameHelper) + .setMIFlag(MachineInstr::FrameSetup) + .copyImplicitOps(MI) + .copyImplicitOps(*FpAdjustment) + .addReg(AArch64::FP, RegState::Implicit | RegState::Define) + .addReg(AArch64::SP, RegState::Implicit); + NextMBBI = std::next(FpAdjustment); + FpAdjustment->eraseFromParent(); + } else if (shouldUseFrameHelper(MBB, NextMBBI, Regs, + FrameHelperType::Prolog)) { + // FP/LR is stored at the top of stack before the prolog helper call. + emitHomogeneousStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP); + auto PrologHelper = + getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Prolog); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addGlobalAddress(PrologHelper) + .setMIFlag(MachineInstr::FrameSetup) + .copyImplicitOps(MI); + } else { + // Fall back to no-helper.
+ for (int I = 0; I < Size; I += 2) + emitHomogeneousStore(MF, MBB, MBBI, *TII, Regs[I], Regs[I + 1]); + } + + MBBI->eraseFromParent(); + return true; +} + +/// Process each machine instruction +/// @param MBB machine basic block +/// @param MBBI current instruction iterator +/// @param NextMBBI next instruction iterator which can be updated +/// @return True when IR is changed. +bool AArch64LowerHomogeneousPE::runOnMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + case AArch64::HOM_Prolog: + return lowerHOM_Prolog(MBB, MBBI, NextMBBI); + case AArch64::HOM_Epilog: + return lowerHOM_Epilog(MBB, MBBI, NextMBBI); + } + return false; +} + +bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= runOnMI(MBB, MBBI, NMBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); + + bool Modified = false; + for (auto &MBB : MF) + Modified |= runOnMBB(MBB); + return Modified; +} + +ModulePass *llvm::createAArch64LowerHomogeneousPrologEpilogPass() { + return new AArch64LowerHomogeneousPrologEpilog(); +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -161,6 +161,8 @@ cl::desc("Enable the AAcrh64 branch target pass"), cl::init(true)); +extern cl::opt<bool> EnableHomogeneousPrologEpilog; + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); @@ -195,6 +197,7 @@ initializeAArch64SLSHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); initializeAArch64StackTaggingPreRAPass(*PR); + initializeAArch64LowerHomogeneousPrologEpilogPass(*PR); } //===----------------------------------------------------------------------===// @@ -621,6 +624,9 @@ } void AArch64PassConfig::addPreSched2() { + // Lower homogeneous frame instructions + if (EnableHomogeneousPrologEpilog) + addPass(createAArch64LowerHomogeneousPrologEpilogPass()); // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -53,6 +53,7 @@ AArch64ISelLowering.cpp AArch64InstrInfo.cpp AArch64LoadStoreOptimizer.cpp + AArch64LowerHomogeneousPrologEpilog.cpp AArch64MachineFunctionInfo.cpp AArch64MacroFusion.cpp AArch64MCInstLower.cpp diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll @@ -0,0 +1,88 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-SAVELR + +; CHECK-LABEL: __Z3foofffi: +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: bl _OUTLINED_FUNCTION_PROLOG_FRAME48_x19x20d8d9d10d11 +; CHECK: bl __Z3goof +; CHECK: bl __Z3goof +; CHECK: b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11 + +; CHECK-SAVELR-LABEL: _Z3foofffi: +; CHECK-SAVELR: stp x29, x30, [sp, #-16]! 
+; CHECK-SAVELR-NEXT: bl OUTLINED_FUNCTION_PROLOG_FRAME0_x19x20x30x29d8d9d10d11
+; CHECK-SAVELR: bl _Z3goof
+; CHECK-SAVELR: bl _Z3goof
+; CHECK-SAVELR: b OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11
+; Test input: two calls to @_Z3goof mixed with float math. The expectations above look for a prolog helper call and a branch to an epilog tail helper.
+define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) ssp optsize "frame-pointer"="non-leaf" {
+entry:
+  %inc = fadd float %b, 1.000000e+00
+  %add = fadd float %inc, %x
+  %add1 = fadd float %add, %y
+  %conv = sitofp i32 %z to float
+  %sub = fsub float %add1, %conv
+  %dec = add nsw i32 %z, -1
+  %call = tail call float @_Z3goof(float %inc) #2
+  %call2 = tail call float @_Z3goof(float %sub) #2
+  %add3 = fadd float %call, %call2
+  %mul = fmul float %inc, %add3
+  %add4 = fadd float %sub, %mul
+  %conv5 = sitofp i32 %dec to float
+  %sub6 = fsub float %add4, %conv5
+  ret float %sub6
+}
+
+; CHECK-LABEL: _Z3zoov:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: bl __Z3hoo
+; CHECK: b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29
+; Test input: one call to @_Z3hoov followed by an add. The expectations above look for an x29/x30 store and a branch to the x30/x29 epilog tail helper.
+define i32 @_Z3zoov() nounwind ssp optsize {
+  %1 = tail call i32 @_Z3hoov() #2
+  %2 = add nsw i32 %1, 1
+  ret i32 %2
+}
+
+
+declare float @_Z3goof(float) nounwind ssp optsize
+declare i32 @_Z3hoov() nounwind ssp optsize
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_PROLOG_FRAME48_x19x20d8d9d10d11:
+; CHECK: stp x20, x19, [sp, #-16]!
+; CHECK-NEXT: stp d9, d8, [sp, #-16]!
+; CHECK-NEXT: stp d11, d10, [sp, #-16]!
+; CHECK-NEXT: add x29, sp, #48
+; CHECK-NEXT: ret
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11:
+; CHECK: ldp d11, d10, [sp], #16
+; CHECK-NEXT: ldp d9, d8, [sp], #16
+; CHECK-NEXT: ldp x20, x19, [sp], #16
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29:
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_PROLOG_FRAME0_x19x20x30x29d8d9d10d11:
+; CHECK-SAVELR: mov x16, x30
+; CHECK-SAVELR-NEXT: ldp x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: stp x20, x19, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-SAVELR-NEXT: stp d9, d8, [sp, #-16]! +; CHECK-SAVELR-NEXT: stp d11, d10, [sp, #-16]! +; CHECK-SAVELR-NEXT: mov x29, sp +; CHECK-SAVELR-NEXT: br x16 + +; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11: +; CHECK-SAVELR: ldp d11, d10, [sp], #16 +; CHECK-SAVELR-NEXT: ldp d9, d8, [sp], #16 +; CHECK-SAVELR-NEXT: ldp x29, x30, [sp], #16 +; CHECK-SAVELR-NEXT: ldp x20, x19, [sp], #16 +; CHECK-SAVELR-NEXT: ret + +; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x30x29: +; CHECK-SAVELR: ldp x29, x30, [sp], #16 +; CHECK-SAVELR-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s --check-prefixes=CHECK-SAVELR + +; CHECK-LABEL: __Z3foofffi: +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: stp x20, x19, [sp, #-16]! +; CHECK-NEXT: stp d9, d8, [sp, #-16]! +; CHECK-NEXT: stp d11, d10, [sp, #-16]! +; CHECK-NEXT: add x29, sp, #48 +; CHECK: bl __Z3goof +; CHECK: bl __Z3goof +; CHECK: ldp d11, d10, [sp], #16 +; CHECK-NEXT: ldp d9, d8, [sp], #16 +; CHECK-NEXT: ldp x20, x19, [sp], #16 +; CHECK-NEXT: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret + +; CHECK-SAVELR-LABEL: _Z3foofffi: +; CHECK-SAVELR: stp x20, x19, [sp, #-16]! +; CHECK-SAVELR-NEXT: stp x29, x30, [sp, #-16]! +; CHECK-SAVELR-NEXT: stp d9, d8, [sp, #-16]! +; CHECK-SAVELR-NEXT: stp d11, d10, [sp, #-16]! 
+; CHECK-SAVELR-NEXT: mov x29, sp
+; CHECK-SAVELR: bl _Z3goof
+; CHECK-SAVELR: bl _Z3goof
+; CHECK-SAVELR: ldp d11, d10, [sp], #16
+; CHECK-SAVELR-NEXT: ldp d9, d8, [sp], #16
+; CHECK-SAVELR-NEXT: ldp x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ldp x20, x19, [sp], #16
+; CHECK-SAVELR-NEXT: ret
+; Test input: uwtable function with two calls to @_Z3goof. The expectations above look for inline stp/ldp pairs and a plain ret, with no helper calls.
+define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) uwtable ssp optsize "frame-pointer"="non-leaf" {
+entry:
+  %inc = fadd float %b, 1.000000e+00
+  %add = fadd float %inc, %x
+  %add1 = fadd float %add, %y
+  %conv = sitofp i32 %z to float
+  %sub = fsub float %add1, %conv
+  %dec = add nsw i32 %z, -1
+  %call = tail call float @_Z3goof(float %inc) #2
+  %call2 = tail call float @_Z3goof(float %sub) #2
+  %add3 = fadd float %call, %call2
+  %mul = fmul float %inc, %add3
+  %add4 = fadd float %sub, %mul
+  %conv5 = sitofp i32 %dec to float
+  %sub6 = fsub float %add4, %conv5
+  ret float %sub6
+}
+
+; CHECK-LABEL: __Z3zoov:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: bl __Z3hoo
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-SAVELR-LABEL: _Z3zoov:
+; CHECK-SAVELR: stp x29, x30, [sp, #-16]!
+; CHECK-SAVELR: bl _Z3hoo
+; CHECK-SAVELR: ldp x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ret
+; Test input: one call to @_Z3hoov followed by an add. The expectations above look for an inline x29/x30 store and reload around the call.
+define i32 @_Z3zoov() nounwind ssp optsize {
+  %1 = tail call i32 @_Z3hoov() #2
+  %2 = add nsw i32 %1, 1
+  ret i32 %2
+}
+
+
+declare float @_Z3goof(float) nounwind ssp optsize
+declare i32 @_Z3hoov() nounwind ssp optsize
diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog| FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-SAVELR
+
+; CHECK-LABEL: __Z3hooii:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: bl _OUTLINED_FUNCTION_PROLOG_x19x20x21x22
+; CHECK: bl __Z3gooi
+; CHECK: bl __Z3gooi
+; CHECK: bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22
+; CHECK-NEXT: b __Z3gooi
+
+; CHECK-SAVELR-LABEL: _Z3hooii:
+; CHECK-SAVELR: stp x29, x30, [sp, #-16]!
+; CHECK-SAVELR-NEXT: bl OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29
+; CHECK-SAVELR: bl _Z3gooi
+; CHECK-SAVELR: bl _Z3gooi
+; CHECK-SAVELR: bl OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29
+; CHECK-SAVELR-NEXT: b _Z3gooi
+; Test input: two calls to @_Z3gooi and a third whose result is returned. The expectations above look for a bl to the epilog helper followed by a branch to the callee.
+define i32 @_Z3hooii(i32 %b, i32 %a) nounwind ssp optsize {
+  %1 = tail call i32 @_Z3gooi(i32 %b)
+  %2 = tail call i32 @_Z3gooi(i32 %a)
+  %3 = add i32 %a, %b
+  %4 = add i32 %3, %1
+  %5 = add i32 %4, %2
+  %6 = tail call i32 @_Z3gooi(i32 %5)
+  ret i32 %6
+}
+
+declare i32 @_Z3gooi(i32);
+
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_PROLOG_x19x20x21x22:
+; CHECK: stp x20, x19, [sp, #-16]!
+; CHECK-NEXT: stp x22, x21, [sp, #-16]!
+; CHECK-NEXT: ret
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22:
+; CHECK: mov x16, x30
+; CHECK-NEXT: ldp x22, x21, [sp], #16
+; CHECK-NEXT: ldp x20, x19, [sp], #16
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: br x16
+
+; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29:
+; CHECK-SAVELR: mov x16, x30
+; CHECK-SAVELR-NEXT: ldp x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: stp x20, x19, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp x22, x21, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-SAVELR-NEXT: br x16
+
+; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29:
+; CHECK-SAVELR: mov x16, x30
+; CHECK-SAVELR-NEXT: ldp x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ldp x22, x21, [sp], #16
+; CHECK-SAVELR-NEXT: ldp x20, x19, [sp], #16
+; CHECK-SAVELR-NEXT: br x16