diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -42,6 +42,7 @@ FunctionPass *createAArch64IndirectThunks(); FunctionPass *createAArch64SpeculationHardeningPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); +ModulePass *createAArch64LowerHomogeneousPrologEpilogPass(); FunctionPass *createAArch64SIMDInstrOptPass(); ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); @@ -79,6 +80,7 @@ void initializeAArch64SLSHardeningPass(PassRegistry&); void initializeAArch64SpeculationHardeningPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); +void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PreLegalizerCombinerPass(PassRegistry&); void initializeAArch64PostLegalizerCombinerPass(PassRegistry &); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -124,6 +124,16 @@ SmallVectorImpl &ObjectsToAllocate) const override; private: + /// Returns true if a homogeneous prolog or epilog code can be emitted + /// for the size optimization. If so, HOM_Prolog/HOM_Epilog pseudo + /// instructions are emitted in place. When Exit block is given, this check is + /// for epilog. + bool homogeneousPrologEpilog(MachineFunction &MF, + MachineBasicBlock *Exit = nullptr) const; + + /// Returns true if CSRs should be paired. + bool producePairRegisters(MachineFunction &MF) const; + bool shouldCombineCSRLocalStackBump(MachineFunction &MF, uint64_t StackBumpBytes) const; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -179,6 +179,11 @@ cl::desc("sort stack allocations"), cl::init(true), cl::Hidden); +cl::opt EnableHomogeneousPrologEpilog( + "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden, + cl::desc("Emit homogeneous prologue and epilogue for the size " + "optimization (default = off)")); + STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// Returns the argument pop size. @@ -213,6 +218,47 @@ return ArgumentPopSize; } +static bool produceCompactUnwindFrame(MachineFunction &MF); +static bool needsWinCFI(const MachineFunction &MF); +static StackOffset getSVEStackSize(const MachineFunction &MF); + +/// Returns true if a homogeneous prolog or epilog code can be emitted +/// for the size optimization. If possible, a frame helper call is injected. +/// When Exit block is given, this check is for epilog. +bool AArch64FrameLowering::homogeneousPrologEpilog( + MachineFunction &MF, MachineBasicBlock *Exit) const { + if (!MF.getFunction().hasMinSize()) + return false; + if (!EnableHomogeneousPrologEpilog) + return false; + if (ReverseCSRRestoreSeq) + return false; + if (EnableRedZone) + return false; + + // TODO: Window is supported yet. + if (needsWinCFI(MF)) + return false; + // TODO: SVE is not supported yet. + if (getSVEStackSize(MF)) + return false; + + // Bail on stack adjustment needed on return for simplicity. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + if (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)) + return false; + if (Exit && getArgumentPopSize(MF, *Exit)) + return false; + + return true; +} + +/// Returns true if CSRs should be paired. +bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const { + return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF); +} + /// This is the biggest offset to the stack pointer we can encode in aarch64 /// instructions (without using a separate calculation and a temp register). /// Note that the exception here are vector stores/loads which cannot encode any @@ -605,6 +651,8 @@ const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + if (homogeneousPrologEpilog(MF)) + return false; if (AFI->getLocalStackSize() == 0) return false; @@ -1148,12 +1196,16 @@ // All of the remaining stack allocations are for locals. AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + bool HomPrologEpilog = homogeneousPrologEpilog(MF); if (CombineSPBump) { assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); NumBytes = 0; + } else if (HomPrologEpilog) { + // Stack has been already adjusted. + NumBytes -= PrologueSaveSize; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI); @@ -1181,13 +1233,20 @@ if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); - // Issue sub fp, sp, FPOffset or - // mov fp,sp when FPOffset is zero. - // Note: All stores of callee-saved registers are marked as "FrameSetup". - // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, - StackOffset::getFixed(FPOffset), TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + if (HomPrologEpilog) { + auto Prolog = MBBI; + --Prolog; + assert(Prolog->getOpcode() == AArch64::HOM_Prolog); + Prolog->addOperand(MachineOperand::CreateImm(FPOffset)); + } else { + // Issue sub fp, sp, FPOffset or + // mov fp,sp when FPOffset is zero. + // Note: All stores of callee-saved registers are marked as "FrameSetup". + // This code marks the instruction(s) that set the FP also. + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, + StackOffset::getFixed(FPOffset), TII, + MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + } } if (windowsRequiresStackProbe(MF, NumBytes)) { @@ -1615,6 +1674,25 @@ // function. if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); + if (homogeneousPrologEpilog(MF, &MBB)) { + assert(!NeedsWinCFI); + auto LastPopI = MBB.getFirstTerminator(); + if (LastPopI != MBB.begin()) { + auto HomogeneousEpilog = std::prev(LastPopI); + if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog) + LastPopI = HomogeneousEpilog; + } + + // Adjust local stack + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackOffset::getFixed(-AFI->getLocalStackSize()), TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI); + + // SP has been already adjusted while restoring callee save regs. + // We've bailed-out the case with adjusting SP for arguments. + assert(AfterCSRPopSize == 0); + return; + } bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); // Assume we can't combine the last pop with the sp restore. @@ -2333,6 +2411,22 @@ MBB.addLiveIn(AArch64::X18); } + if (homogeneousPrologEpilog(MF)) { + auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog)) + .setMIFlag(MachineInstr::FrameSetup); + + for (auto &RPI : RegPairs) { + MIB.addReg(RPI.Reg1); + MIB.addReg(RPI.Reg2); + + // Update register live in. + if (!MRI.isReserved(RPI.Reg1)) + MBB.addLiveIn(RPI.Reg1); + if (!MRI.isReserved(RPI.Reg2)) + MBB.addLiveIn(RPI.Reg2); + } + return true; + } for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { RegPairInfo RPI = *RPII; @@ -2528,6 +2622,14 @@ for (const RegPairInfo &RPI : reverse(RegPairs)) if (!RPI.isScalable()) EmitMI(RPI); + } else if (homogeneousPrologEpilog(MF, &MBB)) { + auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog)) + .setMIFlag(MachineInstr::FrameDestroy); + for (auto &RPI : RegPairs) { + MIB.addReg(RPI.Reg1, RegState::Define); + MIB.addReg(RPI.Reg2, RegState::Define); + } + return true; } else for (const RegPairInfo &RPI : RegPairs) if (!RPI.isScalable()) @@ -2597,7 +2699,7 @@ // MachO's compact unwind format relies on all registers being stored in // pairs. // FIXME: the usual format is actually better if unwinding isn't needed. - if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister && + if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister && !SavedRegs.test(PairedReg)) { SavedRegs.set(PairedReg); if (AArch64::GPR64RegClass.contains(PairedReg) && @@ -2676,7 +2778,7 @@ // MachO's compact unwind format relies on all registers being stored in // pairs, so if we need to spill one extra for BigStack, then we need to // store the pair. - if (produceCompactUnwindFrame(MF)) + if (producePairRegisters(MF)) SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = UnspilledCSGPR; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -3896,6 +3896,14 @@ Sched<[]>; } +// Pseudo instructions for homogeneous prolog/epilog +let isPseudo = 1 in { + // Save CSRs in order, {FPOffset} + def HOM_Prolog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>; + // Restore CSRs in order + def HOM_Epilog : Pseudo<(outs), (ins variable_ops), []>, Sched<[]>; +} + //===----------------------------------------------------------------------===// // Floating point immediate move. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp @@ -0,0 +1,613 @@ +//===- AArch64LowerHomogeneousPrologEpilog.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that lowers homogeneous prolog/epilog instructions. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64InstPrinter.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME \ + "AArch64 homogeneous prolog/epilog lowering pass" + +cl::opt FrameHelperSizeThreshold( + "frame-helper-size-threshold", cl::init(2), cl::Hidden, + cl::desc("The minimum number of instructions that are outlined in a frame " + "helper (default = 2)")); + +namespace { + +class AArch64LowerHomogeneousPE { +public: + const AArch64InstrInfo *TII; + + AArch64LowerHomogeneousPE(Module *M, MachineModuleInfo *MMI) + : M(M), MMI(MMI) {} + + bool run(); + bool runOnMachineFunction(MachineFunction &Fn); + +private: + Module *M; + MachineModuleInfo *MMI; + + bool runOnMBB(MachineBasicBlock &MBB); + bool runOnMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + + /// Lower a HOM_Prolog pseudo instruction into a helper call + /// or a sequence of homogeneous stores. + /// When a a fp setup follows, it can be optimized. + bool lowerProlog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + /// Lower a HOM_Epilog pseudo instruction into a helper call + /// or a sequence of homogeneous loads. + /// When a return follow, it can be optimized. + bool lowerEpilog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); +}; + +class AArch64LowerHomogeneousPrologEpilog : public ModulePass { +public: + static char ID; + + AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) { + initializeAArch64LowerHomogeneousPrologEpilogPass( + *PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } + bool runOnModule(Module &M) override; + + StringRef getPassName() const override { + return AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME; + } +}; + +} // end anonymous namespace + +char AArch64LowerHomogeneousPrologEpilog::ID = 0; + +INITIALIZE_PASS(AArch64LowerHomogeneousPrologEpilog, + "aarch64-lower-homogeneous-prolog-epilog", + AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME, false, false) + +bool AArch64LowerHomogeneousPrologEpilog::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + MachineModuleInfo *MMI = + &getAnalysis().getMMI(); + return AArch64LowerHomogeneousPE(&M, MMI).run(); +} + +bool AArch64LowerHomogeneousPE::run() { + bool Changed = false; + for (auto &F : *M) { + if (F.empty()) + continue; + + MachineFunction *MF = MMI->getMachineFunction(F); + if (!MF) + continue; + Changed |= runOnMachineFunction(*MF); + } + + return Changed; +} +enum FrameHelperType { Prolog, PrologFrame, Epilog, EpilogTail }; + +/// Return a frame helper name with the given CSRs and the helper type. +/// For instance, a prolog helper that saves x19 and x20 is named as +/// OUTLINED_FUNCTION_PROLOG_x19x20. +static std::string getFrameHelperName(SmallVectorImpl &Regs, + FrameHelperType Type, unsigned FpOffset) { + std::ostringstream RegStream; + switch (Type) { + case FrameHelperType::Prolog: + RegStream << "OUTLINED_FUNCTION_PROLOG_"; + break; + case FrameHelperType::PrologFrame: + RegStream << "OUTLINED_FUNCTION_PROLOG_FRAME" << FpOffset << "_"; + break; + case FrameHelperType::Epilog: + RegStream << "OUTLINED_FUNCTION_EPILOG_"; + break; + case FrameHelperType::EpilogTail: + RegStream << "OUTLINED_FUNCTION_EPILOG_TAIL_"; + break; + } + + for (auto Reg : Regs) + RegStream << AArch64InstPrinter::getRegisterName(Reg); + + return RegStream.str(); +} + +/// Create a Function for the unique frame helper with the given name. +/// Return a newly created MachineFunction with an empty MachineBasicBlock. +static MachineFunction &createFrameHelperMachineFunction(Module *M, + MachineModuleInfo *MMI, + StringRef Name) { + LLVMContext &C = M->getContext(); + Function *F = M->getFunction(Name); + assert(F == nullptr && "Function has been created before"); + F = Function::Create(FunctionType::get(Type::getVoidTy(C), false), + Function::ExternalLinkage, Name, M); + assert(F && "Function was null!"); + + // Use ODR linkage to avoid duplication. + F->setLinkage(GlobalValue::LinkOnceODRLinkage); + F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + // Set no-opt/minsize, so we don't insert padding between outlined + // functions. + F->addFnAttr(Attribute::OptimizeNone); + F->addFnAttr(Attribute::NoInline); + F->addFnAttr(Attribute::MinSize); + F->addFnAttr(Attribute::Naked); + + MachineFunction &MF = MMI->getOrCreateMachineFunction(*F); + // Remove unnecessary register liveness and set NoVRegs. + MF.getProperties().reset(MachineFunctionProperties::Property::TracksLiveness); + MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA); + MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs); + MF.getRegInfo().freezeReservedRegs(MF); + + // Create entry block. + BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F); + IRBuilder<> Builder(EntryBB); + Builder.CreateRetVoid(); + + // Insert the new block into the function. + MachineBasicBlock *MBB = MF.CreateMachineBasicBlock(); + MF.insert(MF.begin(), MBB); + + return MF; +} + +/// Emit a store-pair instruction for frame-setup. +static void emitStore(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator Pos, + const TargetInstrInfo &TII, unsigned Reg1, unsigned Reg2, + int Offset, bool IsPreDec) { + bool IsFloat = AArch64::FPR64RegClass.contains(Reg1); + assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2))); + unsigned Opc; + if (IsPreDec) + Opc = IsFloat ? AArch64::STPDpre : AArch64::STPXpre; + else + Opc = IsFloat ? AArch64::STPDi : AArch64::STPXi; + + MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc)); + if (IsPreDec) + MIB.addDef(AArch64::SP); + MIB.addReg(Reg2) + .addReg(Reg1) + .addReg(AArch64::SP) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameSetup); +} + +/// Emit a load-pair instruction for frame-destroy. +static void emitLoad(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator Pos, + const TargetInstrInfo &TII, unsigned Reg1, unsigned Reg2, + int Offset, bool IsPostDec) { + bool IsFloat = AArch64::FPR64RegClass.contains(Reg1); + assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2))); + unsigned Opc; + if (IsPostDec) + Opc = IsFloat ? AArch64::LDPDpost : AArch64::LDPXpost; + else + Opc = IsFloat ? AArch64::LDPDi : AArch64::LDPXi; + + MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc)); + if (IsPostDec) + MIB.addDef(AArch64::SP); + MIB.addReg(Reg2) + .addReg(Reg1) + .addReg(AArch64::SP) + .addImm(Offset) + .setMIFlag(MachineInstr::FrameDestroy); +} + +/// Return a unique function if a helper can be formed with the given Regs +/// and frame type. +/// 1) _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22: +/// stp x22, x21, [sp, #-32]! ; x29/x30 has been stored at the caller +/// stp x20, x19, [sp, #16] +/// ret +/// +/// 2) _OUTLINED_FUNCTION_PROLOG_FRAME32_x30x29x19x20x21x22: +/// stp x22, x21, [sp, #-32]! ; x29/x30 has been stored at the caller +/// stp x20, x19, [sp, #16] +/// add fp, sp, #32 +/// ret +/// +/// 3) _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22: +/// mov x16, x30 +/// ldp x29, x30, [sp, #32] +/// ldp x20, x19, [sp, #16] +/// ldp x22, x21, [sp], #48 +/// ret x16 +/// +/// 4) _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22: +/// ldp x29, x30, [sp, #32] +/// ldp x20, x19, [sp, #16] +/// ldp x22, x21, [sp], #48 +/// ret +/// @param M module +/// @param MMI machine module info +/// @param Regs callee save regs that the helper will handle +/// @param Type frame helper type +/// @return a helper function +static Function *getOrCreateFrameHelper(Module *M, MachineModuleInfo *MMI, + SmallVectorImpl &Regs, + FrameHelperType Type, + unsigned FpOffset = 0) { + assert(Regs.size() >= 2); + auto Name = getFrameHelperName(Regs, Type, FpOffset); + auto *F = M->getFunction(Name); + if (F) + return F; + + auto &MF = createFrameHelperMachineFunction(M, MMI, Name); + MachineBasicBlock &MBB = *MF.begin(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + + int Size = (int)Regs.size(); + switch (Type) { + case FrameHelperType::Prolog: + case FrameHelperType::PrologFrame: { + // Compute the remaining SP adjust beyond FP/LR. + auto LRIdx = std::distance( + Regs.begin(), std::find(Regs.begin(), Regs.end(), AArch64::LR)); + + // If the register stored to the lowest address is not LR, we must subtract + // more from SP here. + if (LRIdx != Size - 2) { + assert(Regs[Size - 2] != AArch64::LR); + emitStore(MF, MBB, MBB.end(), TII, Regs[Size - 2], Regs[Size - 1], + LRIdx - Size + 2, true); + } + + // Store CSRs in the reverse order. + for (int I = Size - 3; I >= 0; I -= 2) { + // FP/LR has been stored at call-site. + if (Regs[I - 1] == AArch64::LR) + continue; + emitStore(MF, MBB, MBB.end(), TII, Regs[I - 1], Regs[I], Size - I - 1, + false); + } + if (Type == FrameHelperType::PrologFrame) + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ADDXri)) + .addDef(AArch64::FP) + .addUse(AArch64::SP) + .addImm(FpOffset) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET)) + .addReg(AArch64::LR); + break; + } + case FrameHelperType::Epilog: + case FrameHelperType::EpilogTail: + if (Type == FrameHelperType::Epilog) + // Stash LR to X16 + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ORRXrs)) + .addDef(AArch64::X16) + .addReg(AArch64::XZR) + .addUse(AArch64::LR) + .addImm(0); + + for (int I = 0; I < Size - 2; I += 2) + emitLoad(MF, MBB, MBB.end(), TII, Regs[I], Regs[I + 1], Size - I - 2, + false); + // Restore the last CSR with post-increment of SP. + emitLoad(MF, MBB, MBB.end(), TII, Regs[Size - 2], Regs[Size - 1], Size, + true); + + BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET)) + .addReg(Type == FrameHelperType::Epilog ? AArch64::X16 : AArch64::LR); + break; + } + + return M->getFunction(Name); +} + +/// This function checks if a frame helper should be used for +/// HOM_Prolog/HOM_Epilog pseudo instruction expansion. +/// @param MBB machine basic block +/// @param NextMBBI next instruction following HOM_Prolog/HOM_Epilog +/// @param Regs callee save registers that are saved or restored. +/// @param Type frame helper type +/// @return True if a use of helper is qualified. +static bool shouldUseFrameHelper(MachineBasicBlock &MBB, + MachineBasicBlock::iterator &NextMBBI, + SmallVectorImpl &Regs, + FrameHelperType Type) { + const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + auto RegCount = Regs.size(); + assert(RegCount > 0 && (RegCount % 2 == 0)); + // # of instructions that will be outlined. + int InstCount = RegCount / 2; + + // Do not use a helper call when not saving LR. + if (std::find(Regs.begin(), Regs.end(), AArch64::LR) == Regs.end()) + return false; + + switch (Type) { + case FrameHelperType::Prolog: + // Prolog helper cannot save FP/LR. + InstCount--; + break; + case FrameHelperType::PrologFrame: { + // Effecitvely no change in InstCount since FpAdjusment is included. + break; + } + case FrameHelperType::Epilog: + // Bail-out if X16 is live across the epilog helper because it is used in + // the helper to handle X30. + for (auto NextMI = NextMBBI; NextMI != MBB.end(); NextMI++) { + if (NextMI->readsRegister(AArch64::W16, TRI)) + return false; + } + // Epilog may not be in the last block. Check the liveness in successors. + for (const MachineBasicBlock *SuccMBB : MBB.successors()) { + if (SuccMBB->isLiveIn(AArch64::W16) || SuccMBB->isLiveIn(AArch64::X16)) + return false; + } + // No change in InstCount for the regular epilog case. + break; + case FrameHelperType::EpilogTail: { + // EpilogTail helper includes the caller's return. + if (NextMBBI == MBB.end()) + return false; + if (NextMBBI->getOpcode() != AArch64::RET_ReallyLR) + return false; + InstCount++; + break; + } + } + + return InstCount >= FrameHelperSizeThreshold; +} + +/// Lower a HOM_Epilog pseudo instruction into a helper call while +/// creating the helper on demand. Or emit a sequence of loads in place when not +/// using a helper call. +/// +/// 1. With a helper including ret +/// HOM_Epilog x30, x29, x19, x20, x21, x22 ; MBBI +/// ret ; NextMBBI +/// => +/// b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22 +/// ... ; NextMBBI +/// +/// 2. With a helper +/// HOM_Epilog x30, x29, x19, x20, x21, x22 +/// => +/// bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22 +/// +/// 3. Without a helper +/// HOM_Epilog x30, x29, x19, x20, x21, x22 +/// => +/// ldp x29, x30, [sp, #32] +/// ldp x20, x19, [sp, #16] +/// ldp x22, x21, [sp], #48 +bool AArch64LowerHomogeneousPE::lowerEpilog( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + auto &MF = *MBB.getParent(); + MachineInstr &MI = *MBBI; + + DebugLoc DL = MI.getDebugLoc(); + SmallVector Regs; + for (auto &MO : MI.operands()) + if (MO.isReg()) + Regs.push_back(MO.getReg()); + int Size = (int)Regs.size(); + if (Size == 0) + return false; + // Registers are in pair. + assert(Size % 2 == 0); + assert(MI.getOpcode() == AArch64::HOM_Epilog); + + auto Return = NextMBBI; + if (shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::EpilogTail)) { + // When MBB ends with a return, emit a tail-call to the epilog helper + auto *EpilogTailHelper = + getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::EpilogTail); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::TCRETURNdi)) + .addGlobalAddress(EpilogTailHelper) + .addImm(0) + .setMIFlag(MachineInstr::FrameDestroy) + .copyImplicitOps(MI) + .copyImplicitOps(*Return); + NextMBBI = std::next(Return); + Return->removeFromParent(); + } else if (shouldUseFrameHelper(MBB, NextMBBI, Regs, + FrameHelperType::Epilog)) { + // The default epilog helper case. + auto *EpilogHelper = + getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Epilog); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addGlobalAddress(EpilogHelper) + .setMIFlag(MachineInstr::FrameDestroy) + .copyImplicitOps(MI); + } else { + // Fall back to no-helper. + for (int I = 0; I < Size - 2; I += 2) + emitLoad(MF, MBB, MBBI, *TII, Regs[I], Regs[I + 1], Size - I - 2, false); + // Restore the last CSR with post-increment of SP. + emitLoad(MF, MBB, MBBI, *TII, Regs[Size - 2], Regs[Size - 1], Size, true); + } + + MBBI->removeFromParent(); + return true; +} + +/// Lower a HOM_Prolog pseudo instruction into a helper call while +/// creating the helper on demand. Or emit a sequence of stores in place when +/// not using a helper call. +/// +/// 1. With a helper including frame-setup +/// HOM_Prolog x30, x29, x19, x20, x21, x22, 32 +/// => +/// stp x29, x30, [sp, #-16]! +/// bl _OUTLINED_FUNCTION_PROLOG_FRAME32_x30x29x19x20x21x22 +/// +/// 2. With a helper +/// HOM_Prolog x30, x29, x19, x20, x21, x22 +/// => +/// stp x29, x30, [sp, #-16]! +/// bl _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22 +/// +/// 3. Without a helper +/// HOM_Prolog x30, x29, x19, x20, x21, x22 +/// => +/// stp x22, x21, [sp, #-48]! +/// stp x20, x19, [sp, #16] +/// stp x29, x30, [sp, #32] +bool AArch64LowerHomogeneousPE::lowerProlog( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + auto &MF = *MBB.getParent(); + MachineInstr &MI = *MBBI; + + DebugLoc DL = MI.getDebugLoc(); + SmallVector Regs; + int LRIdx = 0; + Optional FpOffset; + for (auto &MO : MI.operands()) { + if (MO.isReg()) { + if (MO.getReg() == AArch64::LR) + LRIdx = Regs.size(); + Regs.push_back(MO.getReg()); + } else if (MO.isImm()) { + FpOffset = MO.getImm(); + } + } + int Size = (int)Regs.size(); + if (Size == 0) + return false; + // Allow compact unwind case only for oww. + assert(Size % 2 == 0); + assert(MI.getOpcode() == AArch64::HOM_Prolog); + + if (FpOffset && + shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::PrologFrame)) { + // FP/LR is stored at the top of stack before the prolog helper call. + emitStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP, -LRIdx - 2, true); + auto *PrologFrameHelper = getOrCreateFrameHelper( + M, MMI, Regs, FrameHelperType::PrologFrame, *FpOffset); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addGlobalAddress(PrologFrameHelper) + .setMIFlag(MachineInstr::FrameSetup) + .copyImplicitOps(MI) + .addReg(AArch64::FP, RegState::Implicit | RegState::Define) + .addReg(AArch64::SP, RegState::Implicit); + } else if (!FpOffset && shouldUseFrameHelper(MBB, NextMBBI, Regs, + FrameHelperType::Prolog)) { + // FP/LR is stored at the top of stack before the prolog helper call. + emitStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP, -LRIdx - 2, true); + auto *PrologHelper = + getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Prolog); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL)) + .addGlobalAddress(PrologHelper) + .setMIFlag(MachineInstr::FrameSetup) + .copyImplicitOps(MI); + } else { + // Fall back to no-helper. + emitStore(MF, MBB, MBBI, *TII, Regs[Size - 2], Regs[Size - 1], -Size, true); + for (int I = Size - 3; I >= 0; I -= 2) + emitStore(MF, MBB, MBBI, *TII, Regs[I - 1], Regs[I], Size - I - 1, false); + if (FpOffset) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri)) + .addDef(AArch64::FP) + .addUse(AArch64::SP) + .addImm(*FpOffset) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + } + } + + MBBI->removeFromParent(); + return true; +} + +/// Process each machine instruction +/// @param MBB machine basic block +/// @param MBBI current instruction iterator +/// @param NextMBBIT next instruction iterator which can be updated +/// @return True when IR is changed. +bool AArch64LowerHomogeneousPE::runOnMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + case AArch64::HOM_Prolog: + return lowerProlog(MBB, MBBI, NextMBBI); + case AArch64::HOM_Epilog: + return lowerEpilog(MBB, MBBI, NextMBBI); + } + return false; +} + +bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= runOnMI(MBB, MBBI, NMBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getSubtarget().getInstrInfo()); + + bool Modified = false; + for (auto &MBB : MF) + Modified |= runOnMBB(MBB); + return Modified; +} + +ModulePass *llvm::createAArch64LowerHomogeneousPrologEpilogPass() { + return new AArch64LowerHomogeneousPrologEpilog(); +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -161,6 +161,8 @@ cl::desc("Enable the AAcrh64 branch target pass"), cl::init(true)); +extern cl::opt EnableHomogeneousPrologEpilog; + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine X(getTheAArch64leTarget()); @@ -197,6 +199,7 @@ initializeAArch64SLSHardeningPass(*PR); initializeAArch64StackTaggingPass(*PR); initializeAArch64StackTaggingPreRAPass(*PR); + initializeAArch64LowerHomogeneousPrologEpilogPass(*PR); } //===----------------------------------------------------------------------===// @@ -634,6 +637,9 @@ } void AArch64PassConfig::addPreSched2() { + // Lower homogeneous frame instructions + if (EnableHomogeneousPrologEpilog) + addPass(createAArch64LowerHomogeneousPrologEpilogPass()); // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -59,6 +59,7 @@ AArch64ISelLowering.cpp AArch64InstrInfo.cpp AArch64LoadStoreOptimizer.cpp + AArch64LowerHomogeneousPrologEpilog.cpp AArch64MachineFunctionInfo.cpp AArch64MacroFusion.cpp AArch64MCInstLower.cpp diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-bad-outline.mir b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-bad-outline.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-bad-outline.mir @@ -0,0 +1,40 @@ +# RUN: llc -mtriple=arm64-applie-ios7.0 -start-before=aarch64-lower-homogeneous-prolog-epilog -homogeneous-prolog-epilog %s -o - | FileCheck %s +# +# This test ensure no outlined epilog is formed when X16 is live across the helper. +--- | + @FuncPtr = local_unnamed_addr global i32 (i32)* null, align 8 + + define i32 @_Z3fooi(i32) minsize "frame-pointer"="all" { + ret i32 0 + } + + declare i32 @_Z3gooii(i32, i32) +... +--- +name: _Z3fooi +tracksRegLiveness: true +body: | + bb.0: + liveins: $w0, $lr, $x19, $x20 + successors: %bb.1 + frame-setup HOM_Prolog $lr, $fp, $x19, $x20, 16 + frame-setup CFI_INSTRUCTION def_cfa $w29, 16 + frame-setup CFI_INSTRUCTION offset $w30, -8 + frame-setup CFI_INSTRUCTION offset $w29, -16 + frame-setup CFI_INSTRUCTION offset $w19, -24 + frame-setup CFI_INSTRUCTION offset $w20, -32 + $w19 = nsw ADDWri $w0, 1, 0 + $w1 = ORRWrr $wzr, $w0 + $w0 = ORRWrr $wzr, $w19 + BL @_Z3gooii, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def $w0 + $x8 = ADRP target-flags(aarch64-page) @FuncPtr + $x16 = LDRXui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @FuncPtr + $w0 = nsw ADDWrr renamable $w0, killed renamable $w19 + $lr, $fp, $x19, $x20 = frame-destroy HOM_Epilog + B %bb.1 + + bb.1: + liveins: $w0, $x16 + TCRETURNri killed renamable $x16, 0, csr_aarch64_aapcs, implicit $sp, implicit $w0 +# CHECK: _OUTLINED_FUNCTION_PROLOG_FRAME16_x30x29x19x20: +# CHECK-NOT: _OUTLINED_FUNCTION_EPILOG_x30x29x19x20: diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll @@ -0,0 +1,85 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-LINUX + +; CHECK-LABEL: __Z3foofffi: +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: bl _OUTLINED_FUNCTION_PROLOG_FRAME48_x30x29x19x20d8d9d10d11 +; CHECK: bl __Z3goof +; CHECK: bl __Z3goof +; CHECK: b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11 + +; CHECK-LINUX-LABEL: _Z3foofffi: +; CHECK-LINUX: stp x29, x30, [sp, #-32]! +; CHECK-LINUX-NEXT: bl OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x30x29d8d9d10d11 +; CHECK-LINUX: bl _Z3goof +; CHECK-LINUX: bl _Z3goof +; CHECK-LINUX: b OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11 + +define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) ssp minsize "frame-pointer"="non-leaf" { +entry: + %inc = fadd float %b, 1.000000e+00 + %add = fadd float %inc, %x + %add1 = fadd float %add, %y + %conv = sitofp i32 %z to float + %sub = fsub float %add1, %conv + %dec = add nsw i32 %z, -1 + %call = tail call float @_Z3goof(float %inc) #2 + %call2 = tail call float @_Z3goof(float %sub) #2 + %add3 = fadd float %call, %call2 + %mul = fmul float %inc, %add3 + %add4 = fadd float %sub, %mul + %conv5 = sitofp i32 %dec to float + %sub6 = fsub float %add4, %conv5 + ret float %sub6 +} + +; CHECK-LABEL: _Z3zoov: +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: bl __Z3hoo +; CHECK: b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29 + +define i32 @_Z3zoov() nounwind ssp minsize { + %1 = tail call i32 @_Z3hoov() #2 + %2 = add nsw i32 %1, 1 + ret i32 %2 +} + + +declare float @_Z3goof(float) nounwind ssp minsize +declare i32 @_Z3hoov() nounwind ssp optsize + +; CHECK-LABEL: _OUTLINED_FUNCTION_PROLOG_FRAME48_x30x29x19x20d8d9d10d11: +; CHECK: stp d11, d10, [sp, #-48]! +; CHECK-NEXT: stp d9, d8, [sp, #16] +; CHECK-NEXT: stp x20, x19, [sp, #32] +; CHECK-NEXT: add x29, sp, #48 +; CHECK-NEXT: ret + +; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11: +; CHECK: ldp x29, x30, [sp, #48] +; CHECK-NEXT: ldp x20, x19, [sp, #32] +; CHECK-NEXT: ldp d9, d8, [sp, #16] +; CHECK-NEXT: ldp d11, d10, [sp], #64 +; CHECK-NEXT: ret + +; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29: +; CHECK: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret + +; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x30x29d8d9d10d11: +; CHECK-LINUX: stp d11, d10, [sp, #-32]! +; CHECK-LINUX-NEXT: stp d9, d8, [sp, #16] +; CHECK-LINUX-NEXT: stp x20, x19, [sp, #48] +; CHECK-LINUX-NEXT: add x29, sp, #32 +; CHECK-LINUX-NEXT: ret + +; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11: +; CHECK-LINUX: ldp x20, x19, [sp, #48] +; CHECK-LINUX-NEXT: ldp x29, x30, [sp, #32] +; CHECK-LINUX-NEXT: ldp d9, d8, [sp, #16] +; CHECK-LINUX-NEXT: ldp d11, d10, [sp], #64 +; CHECK-LINUX-NEXT: ret + +; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x30x29: +; CHECK-LINUX: ldp x29, x30, [sp], #16 +; CHECK-LINUX-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll @@ -0,0 +1,70 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s --check-prefixes=CHECK-LINUX + +; CHECK-LABEL: __Z3foofffi: +; CHECK: stp d11, d10, [sp, #-64]! +; CHECK-NEXT: stp d9, d8, [sp, #16] +; CHECK-NEXT: stp x20, x19, [sp, #32] +; CHECK-NEXT: stp x29, x30, [sp, #48] +; CHECK-NEXT: add x29, sp, #48 +; CHECK: bl __Z3goof +; CHECK: bl __Z3goof +; CHECK: ldp x29, x30, [sp, #48] +; CHECK: ldp x20, x19, [sp, #32] +; CHECK: ldp d9, d8, [sp, #16] +; CHECK: ldp d11, d10, [sp], #64 +; CHECK: ret + +; CHECK-LINUX-LABEL: _Z3foofffi: +; CHECK-LINUX: stp d11, d10, [sp, #-64]! +; CHECK-LINUX-NEXT: stp d9, d8, [sp, #16] +; CHECK-LINUX-NEXT: stp x29, x30, [sp, #32] +; CHECK-LINUX-NEXT: stp x20, x19, [sp, #48] +; CHECK-LINUX-NEXT: add x29, sp, #32 +; CHECK-LINUX: bl _Z3goof +; CHECK-LINUX: bl _Z3goof +; CHECK-LINUX: ldp x20, x19, [sp, #48] +; CHECK-LINUX: ldp x29, x30, [sp, #32] +; CHECK-LINUX: ldp d9, d8, [sp, #16] +; CHECK-LINUX: ldp d11, d10, [sp], #64 +; CHECK-LINUX: ret + +define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) uwtable ssp minsize "frame-pointer"="non-leaf" { +entry: + %inc = fadd float %b, 1.000000e+00 + %add = fadd float %inc, %x + %add1 = fadd float %add, %y + %conv = sitofp i32 %z to float + %sub = fsub float %add1, %conv + %dec = add nsw i32 %z, -1 + %call = tail call float @_Z3goof(float %inc) #2 + %call2 = tail call float @_Z3goof(float %sub) #2 + %add3 = fadd float %call, %call2 + %mul = fmul float %inc, %add3 + %add4 = fadd float %sub, %mul + %conv5 = sitofp i32 %dec to float + %sub6 = fsub float %add4, %conv5 + ret float %sub6 +} + +; CHECK-LABEL: __Z3zoov: +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: bl __Z3hoo +; CHECK: ldp x29, x30, [sp], #16 +; CHECK-NEXT: ret + +; CHECK-LINUX-LABEL: _Z3zoov: +; CHECK-LINUX: stp x29, x30, [sp, #-16]! +; CHECK-LINUX: bl _Z3hoo +; CHECK-LINUX: ldp x29, x30, [sp], #16 +; CHECK-LINUX-NEXT: ret + +define i32 @_Z3zoov() nounwind ssp minsize { + %1 = tail call i32 @_Z3hoov() #2 + %2 = add nsw i32 %1, 1 + ret i32 %2 +} + + +declare float @_Z3goof(float) nounwind ssp minsize +declare i32 @_Z3hoov() nounwind ssp minsize diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll @@ -0,0 +1,55 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog| FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-LINUX + +; CHECK-LABEL: __Z3hooii: +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK-NEXT: bl _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22 +; CHECK: bl __Z3gooi +; CHECK: bl __Z3gooi +; CHECK: bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22 +; CHECK-NEXT: b __Z3gooi + +; CHECK-LINUX-LABEL: _Z3hooii: +; CHECK-LINUX: stp x29, x30, [sp, #-48]! +; CHECK-LINUX-NEXT: bl OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29 +; CHECK-LINUX: bl _Z3gooi +; CHECK-LINUX: bl _Z3gooi +; CHECK-LINUX: bl OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29 +; CHECK-LINUX-NEXT: b _Z3gooi + +define i32 @_Z3hooii(i32 %b, i32 %a) nounwind ssp minsize { + %1 = tail call i32 @_Z3gooi(i32 %b) + %2 = tail call i32 @_Z3gooi(i32 %a) + %3 = add i32 %a, %b + %4 = add i32 %3, %1 + %5 = add i32 %4, %2 + %6 = tail call i32 @_Z3gooi(i32 %5) + ret i32 %6 +} + +declare i32 @_Z3gooi(i32); + + +; CHECK-LABEL: _OUTLINED_FUNCTION_PROLOG_x30x29x19x20x21x22: +; CHECK: stp x22, x21, [sp, #-32]! +; CHECK-NEXT: stp x20, x19, [sp, #16] +; CHECK-NEXT: ret + +; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22: +; CHECK: mov x16, x30 +; CHECK-NEXT: ldp x29, x30, [sp, #32] +; CHECK-NEXT: ldp x20, x19, [sp, #16] +; CHECK-NEXT: ldp x22, x21, [sp], #48 +; CHECK-NEXT: ret x16 + +; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29: +; CHECK-LINUX: stp x22, x21, [sp, #16] +; CHECK-LINUX-NEXT: stp x20, x19, [sp, #32] +; CHECK-LINUX-NEXT: ret + +; CHECK-LINUX-LABEL: OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29: +; CHECK-LINUX: mov x16, x30 +; CHECK-LINUX-NEXT: ldp x20, x19, [sp, #32] +; CHECK-LINUX-NEXT: ldp x22, x21, [sp, #16] +; CHECK-LINUX-NEXT: ldp x29, x30, [sp], #48 +; CHECK-LINUX-NEXT: ret x16