Index: include/llvm/CodeGen/AsmPrinter.h =================================================================== --- include/llvm/CodeGen/AsmPrinter.h +++ include/llvm/CodeGen/AsmPrinter.h @@ -22,6 +22,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/CodeGen/DwarfStringPoolEntry.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/ErrorHandling.h" @@ -153,6 +154,24 @@ /// maintains ownership of the emitters. SmallVector Handlers; + // FIXME: ShrinkWrap2: Find a way to emit CFI directives compatible with + // shrink-wrapping. We now emit .cfi_offset and .cfi_restore for saves and + // restores, we re-process them to see if the final layout needs more work or + // not based on the block order. + + typedef DenseMap CSRMap; + + // FIXME: This shouldn't be here. + DenseMap RegToCSRIdx; + + // FIXME: ShrinkWrap2: Compute CFI save / restore directives based on the + // final layout. + CSRMap ExtraSaveCFI; + CSRMap ExtraRestoreCFI; + + // FIXME: ShrinkWrap2: How does this work with stack shrink-wrapping. Is there + // a way to "restore" everything? + public: struct SrcMgrDiagInfo { SourceMgr SrcMgr; @@ -294,12 +313,14 @@ void emitFrameAlloc(const MachineInstr &MI); enum CFIMoveType { CFI_M_None, CFI_M_EH, CFI_M_Debug }; - CFIMoveType needsCFIMoves(); + CFIMoveType needsCFIMoves() const; // FIXME: ShrinkWrap2: Separate commit. /// Returns false if needsCFIMoves() == CFI_M_EH for any function /// in the module. bool needsOnlyDebugCFIMoves() const { return isCFIMoveForDebugging; } + void generateShrinkWrappingCFI(); + bool needsSEHMoves(); /// Print to the current output stream assembly representations of the Index: include/llvm/CodeGen/MachineFrameInfo.h =================================================================== --- include/llvm/CodeGen/MachineFrameInfo.h +++ include/llvm/CodeGen/MachineFrameInfo.h @@ -15,6 +15,9 @@ #define LLVM_CODEGEN_MACHINEFRAMEINFO_H #include "llvm/ADT/SmallVector.h" +// FIXME: ShrinkWrap2: Temporary hack. Remove. +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/Support/DataTypes.h" #include #include @@ -22,7 +25,6 @@ namespace llvm { class raw_ostream; class MachineFunction; -class MachineBasicBlock; class BitVector; class AllocaInst; @@ -42,6 +44,17 @@ void setFrameIdx(int FI) { FrameIdx = FI; } }; +/// Map a set of registers to a basic block. This is a replacement for CSInfo +/// with extra information about the location of the saves / restores pinned +/// to a basic block. One register may appear more than once in the map, as +/// long as it is associated to a different basic block. The CSIs may share +/// frame indexes for different registers, for different basic blocks. +/// Similar to CSInfo, the frame indexes in the CalleeSavedInfo struct are +/// valid ony if CSIValid is true. +// FIXME: ShrinkWrap2: Make this a DenseMap +typedef DenseMap> + CalleeSavedMap; + /// The MachineFrameInfo class represents an abstract stack frame until /// prolog/epilog code is inserted. This class is key to allowing stack frame /// representation optimizations, such as frame pointer elimination. It also @@ -266,12 +279,22 @@ /// stack objects like arguments so we can't treat them as immutable. bool HasTailCall = false; + // FIXME: ShrinkWrap2: Deprecate. /// Not null, if shrink-wrapping found a better place for the prologue. MachineBasicBlock *Save = nullptr; /// Not null, if shrink-wrapping found a better place for the epilogue. MachineBasicBlock *Restore = nullptr; +private: + /// Should the PrologEpilogInserter and the various target hooks use the + /// information gathered from shrink-wrapping? + // FIXME: ShrinkWrap2: Fix name. + // FIXME: ShrinkWrap2: Merge shrink-wrapped / non-shrink-wrapped paths. + bool ShouldUseShrinkWrap2 = false; + public: + // FIXME: ShrinkWrap2: Temporary hack. Remove. + RegScavenger *RS; explicit MachineFrameInfo(unsigned StackAlignment, bool StackRealignable, bool ForcedRealign) : StackAlignment(StackAlignment), StackRealignable(StackRealignable), @@ -658,11 +681,24 @@ void setCalleeSavedInfoValid(bool v) { CSIValid = v; } + // FIXME: ShrinkWrap2: Merge with multiple points. MachineBasicBlock *getSavePoint() const { return Save; } void setSavePoint(MachineBasicBlock *NewSave) { Save = NewSave; } MachineBasicBlock *getRestorePoint() const { return Restore; } void setRestorePoint(MachineBasicBlock *NewRestore) { Restore = NewRestore; } + // FIXME: ShrinkWrap2: Is this the right place for this? This should be + // somewhere in PEI or TargetFrameLowering, since they are the only ones using + // it. + // FIXME: ShrinkWrap2: This gets really messy and we should merge all the + // behaviour for both shrink-wrapping passes and with it disabled. + // FIXME: ShrinkWrap2: Name. + // FIXME: ShrinkWrap2: Merge shrink-wrapped / non-shrink-wrapped paths. + bool getShouldUseShrinkWrap2() const { return ShouldUseShrinkWrap2; } + // FIXME: ShrinkWrap2: Name. + // FIXME: ShrinkWrap2: Merge shrink-wrapped / non-shrink-wrapped paths. + void setShouldUseShrinkWrap2(bool New) { ShouldUseShrinkWrap2 = New; } + /// Return a set of physical registers that are pristine. /// /// Pristine registers hold a value that is useless to the current function, Index: include/llvm/CodeGen/ShrinkWrapper.h =================================================================== --- /dev/null +++ include/llvm/CodeGen/ShrinkWrapper.h @@ -0,0 +1,331 @@ +//===- llvm/CodeGen/ShrinkWrapper.h - Shrink Wrapping Utility ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This class is the main utility to provide shrink-wrapping properties to any +// kind of attributes. This is used to do callee-saved registers and stack +// shrink-wrapping. The algorithm is based on "Minimizing Register Usage Penalty +// at Procedure Calls - Fred C. Chow" [1], with the usage of SCCs to exclude +// loops and provide a linear pass instead of a complete dataflow analysis. +// FIXME: ShrinkWrap2: Random thoughts: +// - r193749 removed an old pass that was an implementation of [1]. +// - Cost model: use MachineBlockFrequency and some instruction cost model? +// - Split critical edges on demand? +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_SHRINKWRAP_H +#define LLVM_CODEGEN_SHRINKWRAP_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/PostOrderIterator.h" + +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +class MachineBlockFrequencyInfo; +class MachineOptimizationRemarkEmitter; + +/// Information about the requirements on shrink-wrapping. This should describe +/// what does "used" mean, and it should be the main interface to work with the +/// targets and other shrink-wrappable inputs. +class ShrinkWrapInfo { +protected: + /// Track all the uses per basic block. + SmallVector Uses; + + /// The machine function we're shrink-wrapping. + const MachineFunction &MF; + + /// Generic code to determine callee saved register uses. This checks for + /// regmasks, and tracks all the register units. + /// If there is an use on a terminator, the successors will also be marked as + /// used. + // FIXME: ShrinkWrap2: Make this a free-function outside shrink-wrapping. + void determineCSRUses(); + +public: + ShrinkWrapInfo(const MachineFunction &MF) + : Uses(MF.getNumBlockIDs()), MF(MF) {} + /// Get the number of results we want per block. i.e. number of registers in + /// the target. + virtual unsigned getNumResultBits() const { return 0; } + + /// Get the elements that are used for a particular basic block. The result is + /// `nullptr` if there are no uses. + virtual const BitVector *getUses(unsigned MBBNum) const; + + /// Provide a way to print elements. Debug only. + // FIXME: ShrinkWrap2: Add DUMP macros. + virtual raw_ostream &printElt(unsigned Elt, raw_ostream &OS) const { + OS << Elt; + return OS; + }; + + virtual ~ShrinkWrapInfo() = default; +}; + +/// Iterator for successors / predecessors. This is here to work with +/// SmallVector and std::vector at the same time. +// FIXME: ShrinkWrap2: Use ArrayRef? +typedef const MachineBasicBlock *const *MBBIterator; + +class ShrinkWrapper { + typedef BitVector MBBSet; + /// Result type used to store results / uses. The target decides the meaning + /// of the bits. + typedef BitVector TargetResultSet; + // Idx = MBB.getNumber() + typedef SmallVector BBResultSetMap; + typedef DenseMap SparseBBResultSetMap; + + /// The shrink-wrapping analysis is based on two properties: + /// * Anticipation: + /// The use of a register is ancicipated at a given point if a use of the + /// register will be encountered in all possible execution paths leading from + /// that point. + + /// * Availability: + /// The use of a register is available at a given point if a use of the + /// register has been encountered in all possible execution paths that lead to + /// that point. + + /// Both attributes are propagated at the beginning and at the end of a block + /// (which could be an SCC, or a basic block). + // FIXME: ShrinkWrap2: Remove OUT/IN. + struct SWAttributes { + /// Is the element anticipated at the beginning of this block? + TargetResultSet ANTIN; + /// Is the element available at the end of this block? + TargetResultSet AVOUT; + + /// Resize all the sets. + SWAttributes(const ShrinkWrapInfo &SWI) { + unsigned Max = SWI.getNumResultBits(); + for (TargetResultSet *BV : {&ANTIN, &AVOUT}) + (*BV).resize(Max); + } + }; + + // Idx = MBB.getNumber() + typedef SmallVector AttributeMap; + + /// An SCC that was discovered through the scc_iterator on the function. + /// This is used in order to detect loops, reducible *AND* irreducible. + struct SCCLoop { + typedef SmallVector MBBVector; + /// The successors of the SCC. These are blocks outside the SCC. + SetVector Successors; + iterator_range successors() const { + return {&*Successors.begin(), &*Successors.end()}; + } + /// The predecessors of the SCC. These are blocks outside the SCC. + SetVector Predecessors; + iterator_range predecessors() const { + return {&*Predecessors.begin(), &*Predecessors.end()}; + } + /// This number is the number of the first MBB in the SCC. + unsigned Number; + unsigned getNumber() const { return Number; } + /// The number of blocks the SCC contains. + unsigned Size; + unsigned getSize() const { return Size; } + }; + + /// Wrapper around scc_iterator that collects SCCs that are loops, computes + /// their successor / predecessor and assigns an unique number based on the + /// basic blocks it contains. + struct SCCLoopInfo { + /// Own the SCCs. + SmallVector SCCs; + /// Map a basic block number to an SCCLoop number. The SCCLoop number is + /// the position in the `SCCs` vector, and it is differrent from the + /// SCCLoop::Number attribute, which is the first basic block's number in + /// the SCC. + DenseMap MBBToSCC; + + /// Initialize the successors / predecessors of the SCCLoops. + SCCLoopInfo(const MachineFunction &MF); + /// Get the SCCLoop for a designated basic block number. If there is no + /// SCCLoop associated, return `nullptr`. + SCCLoop *getSCCLoopFor(unsigned MBBNum) { + auto It = MBBToSCC.find(MBBNum); + if (It == MBBToSCC.end()) + return nullptr; + return &SCCs[It->second]; + } + const SCCLoop *getSCCLoopFor(unsigned MBBNum) const { + return const_cast(this)->getSCCLoopFor(MBBNum); + } + }; + + /// The MachineFunction we're working on. + const MachineFunction &MF; + + /// Target-found uses. + // FIXME: ShrinkWrap2: Use the one from ShrinkWrapInfo, but detecting critical + // edges may need to modify it. + BBResultSetMap Uses; + + // FIXME: ShrinkWrap2: Is this the correct place to compute this? + /// Blocks that never return. + MBBSet NoReturnBlocks; + + /// Target-specific shrink-wrap information. + std::unique_ptr SWI; + + /// The replacement for the MachineLoopInfo, that handles irreducible loops + /// as well. + SCCLoopInfo SI; + + /// Final results. + SparseBBResultSetMap Saves; + SparseBBResultSetMap Restores; + + /// Number of times the attributes have been recomputed because of critical + /// edges. + unsigned AttributesRecomputed = 0; + + /// All the elements encountered so far. + TargetResultSet AllElts; + + /// The CFG we're working on is no longer composed of basic blocks. It's + /// basically the CFG of SCCs, and we're using numbers to identify nodes. A + /// simple basic block's number is MBB->getNumber(), and a SCC that is a + /// loop gets the number of the first basic block encountered. For that, + /// we're using the following functions to traverse our CFG. + + /// Get the block number or the SCCLoop's number. + unsigned blockNumber(unsigned MBBNum) const; + /// Get the block successors or the SCCLoop exit blocks. + iterator_range blockSuccessors(unsigned MBBNum) const; + /// Get the block predecessors or the SCCLoop's predecessors. + iterator_range blockPredecessors(unsigned MBBNum) const; + + /// Anticipability + // If there is an use of this on *all* the paths starting from + // this basic block, the element is anticipated at the end of this + // block. + // (propagate the IN attribute of successors to possibly merge saves) + // - + // | *false* if no successor. + // ANTOUT = | + // | && ANTIN(succ[i]) otherwise. + // + bool ANTOUT(const AttributeMap &Attrs, unsigned MBBNum, unsigned Elt) const { + auto Successors = blockSuccessors(MBBNum); + if (Successors.begin() == Successors.end()) + return false; + return all_of(Successors, [&](const MachineBasicBlock *S) { + return Attrs[blockNumber(S->getNumber())].ANTIN.test(Elt); + }); + } + + /// Availability + // If there is an use of this on *all* the paths arriving in this block, + // then the element is available in this block (propagate the out attribute + // of predecessors to possibly merge restores). + // - + // | *false* if no predecessor. + // AVIN = | + // | && AVOUT(pred[i]) otherwise. + // - + bool AVIN(const AttributeMap &Attrs, unsigned MBBNum, unsigned Elt) const { + auto Predecessors = blockPredecessors(MBBNum); + if (Predecessors.begin() == Predecessors.end()) + return false; + return all_of(Predecessors, [&](const MachineBasicBlock *P) { + return Attrs[blockNumber(P->getNumber())].AVOUT.test(Elt); + }); + } + + /// Determine uses based on ShrinkWrapInfo. + // FIXME: ShrinkWrap2: Remove. Call SWI directly. + void determineUses(); + /// Remove uses and fill NoReturnBlocks with the blocks that we know are not + /// going to return from the function. + /// FIXME: ShrinkWrap2: Is this the correct place to compute this? + void removeUsesOnNoReturnPaths(); + void dumpUses() const; + /// Mark all the basic blocks / SCCs around a loop (pred, succ) as used, + /// if there is an usage of a CSR inside a loop. We want to avoid any save / + /// restore operations inside a loop. + void markUsesOutsideLoops(); + + /// Compute the attributes for one element. + // FIXME: ShrinkWrap2: Don't do this per element. + void computeAttributes( + unsigned Elt, AttributeMap &Attrs, + ReversePostOrderTraversal &RPOT) const; + /// Save the results for this particular element. + // FIXME: ShrinkWrap2: Don't do this per element. + void gatherAttributesResults(unsigned Elt, AttributeMap &Attrs); + /// Check for critical edges and mark new blocks as needed. + // FIXME: ShrinkWrap2: Don't do this per element. + bool hasCriticalEdges(unsigned Elt, AttributeMap &Attrs); + /// Dump the contents of the attributes. + // FIXME: ShrinkWrap2: Don't do this per element. + void dumpAttributes(unsigned Elt, const AttributeMap &Attrs) const; + + /// * Verify if the results are better than obvious results, like: + /// * CSR used in a single MBB: only one save and one restore. + /// * Remove empty entries from the Saves / Restores maps. + // FIXME: ShrinkWrap2: This shouldn't happen, we better fix the algorithm + // first. + void postProcessResults(const BBResultSetMap &OldUses); + /// Compute the shrink-wrapping cost, which is based on block frequency. + unsigned computeShrinkWrappingCost(MachineBlockFrequencyInfo *MBFI) const; + /// Compute the same cost, in entry / return blocks, which is based on block + /// frequency. + unsigned computeDefaultCost(MachineBlockFrequencyInfo *MBFI) const; + /// Verify save / restore points by walking the CFG. + /// This asserts if anything went wrong. + // FIXME: ShrinkWrap2: Should this be guarded by a macro? + void verifySavesRestores() const; + + /// Dump the final shrink-wrapping results. + void dumpResults() const; + +public: + /// Run the shrink-wrapper on the function. If there are no uses, there will + /// be no saves / restores. + /// By default, run the shrink-wrapper with the target's CSRShrinkWrapInfo. + ShrinkWrapper(const MachineFunction &MF); + /// Run the shrink-wrapper with a custom ShrinkWrapInfo. + ShrinkWrapper(const MachineFunction &MF, std::unique_ptr SWI); + + /// Check if the function has any uses that can be shrink-wrapped. + bool hasUses() const { return !Uses.empty(); } + + /// Get the target's shrink-wrap info. + ShrinkWrapInfo &getSWI() { return *SWI; }; + const ShrinkWrapInfo &getSWI() const { return *SWI; }; + + /// Get the final results. + const SparseBBResultSetMap &getSaves() { return Saves; } + const SparseBBResultSetMap &getRestores() { return Restores; } + + /// Emit optimization remarks for the whole function. + void emitRemarks(MachineOptimizationRemarkEmitter *ORE, + MachineBlockFrequencyInfo *MBFI) const; + + /// Check that the final results are better than the default behaviour. + bool areResultsInteresting(MachineBlockFrequencyInfo *MBFI) const; +}; + +} // end namespace llvm + +#endif // LLVM_CODEGEN_SHRINKWRAP_H Index: include/llvm/Target/TargetFrameLowering.h =================================================================== --- include/llvm/Target/TargetFrameLowering.h +++ include/llvm/Target/TargetFrameLowering.h @@ -15,6 +15,8 @@ #define LLVM_TARGET_TARGETFRAMELOWERING_H #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/ShrinkWrapper.h" #include #include @@ -23,6 +25,7 @@ class CalleeSavedInfo; class MachineFunction; class RegScavenger; + class ShrinkWrapInfo; /// Information about stack frame layout on the target. It holds the direction /// of stack growth, the known stack alignment on entry to each function, and @@ -326,6 +329,13 @@ return true; } + // FIXME: ShrinkWrap2: Yet another target hook to be removed later. See + // comment in PrologEpilogInserter.cpp:579 + virtual void + processValidCalleeSavedInfo(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector &CSI) const {} + /// Check if given function is safe for not having callee saved registers. /// This is used when interprocedural register allocation is enabled. static bool isSafeForNoCSROpt(const Function *F) { @@ -339,6 +349,13 @@ return false; return true; } + + /// Provide all the target-hooks needed for shrink-wrapping. + virtual std::unique_ptr + createCSRShrinkWrapInfo(const MachineFunction &MF) const { + llvm_unreachable("Target didn't implement a ShrinkWrapInfo subclass!"); + return nullptr; + } }; } // End llvm namespace Index: lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" @@ -899,8 +900,13 @@ return true; } -AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() { - if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI && +AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() const { + ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType(); + if (ExceptionHandlingType != ExceptionHandling::DwarfCFI && + ExceptionHandlingType != ExceptionHandling::ARM) + return CFI_M_None; + + if (ExceptionHandlingType == ExceptionHandling::DwarfCFI && MF->getFunction()->needsUnwindTableEntry()) return CFI_M_EH; @@ -910,16 +916,135 @@ return CFI_M_None; } +void AsmPrinter::generateShrinkWrappingCFI() { + // Reset everything. + ExtraSaveCFI.clear(); + ExtraRestoreCFI.clear(); + + // FIXME: ShrinkWrap2: Gather all the saving points (based on CFI). + CSRMap Saves; + // FIXME: ShrinkWrap2: Gather all the restoring points (based on CFI). + CSRMap Restores; + + const MCRegisterInfo *MCRI = MF->getMMI().getContext().getRegisterInfo(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Collect all the CSRs and their index. + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned DwarfReg = MCRI->getDwarfRegNum(CSRegs[i], true); + unsigned Reg = MCRI->getLLVMRegNum(DwarfReg, false); + RegToCSRIdx[Reg] = i; + } + + // First pass, collect .cfi_offset and .cfi_restore directives: + // * .cfi_offset represents a csr save + // * .cfi_restore represents a csr restore + for (const MachineBasicBlock &MBB : *MF) { + for (const MachineInstr &MI : MBB) { + if (!MI.isCFIInstruction()) + continue; + const std::vector &Instrs = MF->getFrameInstructions(); + unsigned CFIIndex = MI.getOperand(0).getCFIIndex(); + const MCCFIInstruction &CFI = Instrs[CFIIndex]; + + // Check if it's a save. + if (CFI.getOperation() == MCCFIInstruction::OpOffset) { + unsigned DwarfReg = CFI.getRegister(); + unsigned Reg = MCRI->getLLVMRegNum(DwarfReg, false); + if (RegToCSRIdx.count(Reg)) { + BitVector &Save = Saves[MBB.getNumber()]; + Save.resize(RegToCSRIdx.size()); + Save.set(RegToCSRIdx[Reg]); + } + } + + // Check if it's a restore. + if (CFI.getOperation() == MCCFIInstruction::OpRestore) { + unsigned DwarfReg = CFI.getRegister(); + unsigned Reg = MCRI->getLLVMRegNum(DwarfReg, false); + if (RegToCSRIdx.count(Reg)) { + BitVector &Restore = Restores[MBB.getNumber()]; + Restore.resize(RegToCSRIdx.size()); + Restore.set(RegToCSRIdx[Reg]); + } + } + } + } + + // Compute the "liveness" of the CSRs. A CSR is live if it has been saved, + // and killed if it has been restored. + SmallVector LiveCSRs{MF->getNumBlockIDs()}; + for (BitVector &BV : LiveCSRs) + BV.resize(RegToCSRIdx.size()); + + ReversePostOrderTraversal RPOT(MF); + for (const MachineBasicBlock *MBB : RPOT) { + BitVector &LiveHere = LiveCSRs[MBB->getNumber()]; + // LIVE(MBB) += LIVE(EACH_PRED) - RESTORE(EACH_PRED) + SAVE(MBB) + // Propagate the liveness information. + for (const MachineBasicBlock *Pred : MBB->predecessors()) + LiveHere |= LiveCSRs[Pred->getNumber()]; + // If any of the predecessors restored any CSR, kill them. + for (const MachineBasicBlock *Pred : MBB->predecessors()) { + auto Found = Restores.find(Pred->getNumber()); + if (Found == Restores.end()) + continue; + BitVector &Killed = Found->second; + LiveHere.flip(); + LiveHere |= Killed; + LiveHere.flip(); + } + // If this block saved any CSRs, make them live. + auto Found = Saves.find(MBB->getNumber()); + if (Found == Saves.end()) + continue; + BitVector &Saved = Found->second; + LiveHere |= Saved; + } + + // Now compute the state changes we need in between the blocks. + BitVector LastState(RegToCSRIdx.size()); + for (const MachineBasicBlock &MBB : *MF) { + BitVector &LiveHere = LiveCSRs[MBB.getNumber()]; + if (&MBB != &MF->front()) { + auto Prev = std::prev(MBB.getIterator()); + auto Found = Restores.find(Prev->getNumber()); + if (Found != Restores.end() && !Found->second.empty()) { + BitVector &Killed = Found->second; + LastState.flip(); + LastState |= Killed; + LastState.flip(); + } + } + + // Save everything that is added in the current state and was not there in + // the last one (and the saves that are already here). + BitVector ToSave = LastState; + ToSave |= Saves[MBB.getNumber()]; + ToSave.flip(); + ToSave &= LiveHere; + if (ToSave.count()) + ExtraSaveCFI[MBB.getNumber()] = std::move(ToSave); + + // Restore everything that is not in the current state anymore but it was + // in the last one. + BitVector ToRestore = LastState; + ToRestore.flip(); + ToRestore |= LiveHere; + ToRestore.flip(); + if (ToRestore.count()) + ExtraRestoreCFI[MBB.getNumber()] = std::move(ToRestore); + + LastState = LiveHere; + } +} + bool AsmPrinter::needsSEHMoves() { return MAI->usesWindowsCFI() && MF->getFunction()->needsUnwindTableEntry(); } void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) { - ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType(); - if (ExceptionHandlingType != ExceptionHandling::DwarfCFI && - ExceptionHandlingType != ExceptionHandling::ARM) - return; - if (needsCFIMoves() == CFI_M_None) return; @@ -1429,6 +1554,14 @@ EnablePrintSchedInfo = PrintSchedule.getNumOccurrences() ? PrintSchedule : STI.supportPrintSchedInfo(); + + if (needsCFIMoves() == CFI_M_None) + return; + + // FIXME: ShrinkWrap2: Compute the blocks that need CFI state switching. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.getShouldUseShrinkWrap2()) + generateShrinkWrappingCFI(); } namespace { @@ -2659,6 +2792,42 @@ } else { OutStreamer->EmitLabel(MBB.getSymbol()); } + + // FIXME: ShrinkWrap2: Insert the CFI that are needed to do the transition + // between each block. + if (needsCFIMoves() == CFI_M_None) + return; + + DenseMap CSRIdxToCSIIdx; + const MCRegisterInfo *MCRI = MF->getMMI().getContext().getRegisterInfo(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); + const std::vector &CSIs = MFI.getCalleeSavedInfo(); + for (auto &KV : enumerate(CSIs)) { + const CalleeSavedInfo &CSI = KV.value(); + unsigned Reg = CSI.getReg(); + unsigned DwarfReg = MCRI->getDwarfRegNum(Reg, true); + Reg = MCRI->getLLVMRegNum(DwarfReg, false); + unsigned CSIIdx = KV.index(); + CSRIdxToCSIIdx[RegToCSRIdx.lookup(Reg)] = CSIIdx; + } + + if (MFI.getShouldUseShrinkWrap2()) { + const MCRegisterInfo *MRI = MF->getMMI().getContext().getRegisterInfo(); + for (unsigned CSRIdx : ExtraSaveCFI.lookup(MBB.getNumber()).set_bits()) { + const CalleeSavedInfo &CSI = CSIs[CSRIdxToCSIIdx[CSRIdx]]; + int64_t Offset = MFI.getObjectOffset(CSI.getFrameIdx()); + unsigned DwarfReg = MRI->getDwarfRegNum(CSI.getReg(), true); + // .cfi_offset %reg, off + emitCFIInstruction( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + } + for (unsigned CSRIdx : ExtraRestoreCFI.lookup(MBB.getNumber()).set_bits()) { + const CalleeSavedInfo &CSI = CSIs[CSRIdxToCSIIdx[CSRIdx]]; + unsigned DwarfReg = MRI->getDwarfRegNum(CSI.getReg(), true); + // .cfi_restore %reg + emitCFIInstruction(MCCFIInstruction::createRestore(nullptr, DwarfReg)); + } + } } void AsmPrinter::EmitVisibility(MCSymbol *Sym, unsigned Visibility, Index: lib/CodeGen/AsmPrinter/CodeViewDebug.cpp =================================================================== --- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -1020,6 +1020,7 @@ // non-frame setup location marks the beginning of the function body. // FIXME: is there a simpler a way to do this? Can we just search // for the first instruction of the function, not the last of the prolog? + // FIXME: ShrinkWrap2: This won't work with shrink-wrapping, I guess. DebugLoc PrologEndLoc; bool EmptyPrologue = true; for (const auto &MBB : *MF) { Index: lib/CodeGen/CMakeLists.txt =================================================================== --- lib/CodeGen/CMakeLists.txt +++ lib/CodeGen/CMakeLists.txt @@ -127,6 +127,8 @@ ScoreboardHazardRecognizer.cpp ShadowStackGCLowering.cpp ShrinkWrap.cpp +# FIXME: ShrinkWrap2: Merge. + ShrinkWrapper.cpp SjLjEHPrepare.cpp SlotIndexes.cpp SpillPlacement.cpp Index: lib/CodeGen/PrologEpilogInserter.cpp =================================================================== --- lib/CodeGen/PrologEpilogInserter.cpp +++ lib/CodeGen/PrologEpilogInserter.cpp @@ -20,15 +20,18 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/ShrinkWrapper.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/InlineAsm.h" @@ -41,18 +44,20 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/MC/MCAsmInfo.h" #include using namespace llvm; #define DEBUG_TYPE "prologepilog" +// FIXME: ShrinkWrap2: Fix name. +cl::opt + EnableShrinkWrap2Opt("enable-shrink-wrap2", cl::Hidden, + cl::desc("enable the shrink-wrapping 2 pass")); + typedef SmallVector MBBVector; -static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, - unsigned &MinCSFrameIndex, - unsigned &MaxCXFrameIndex, - const MBBVector &SaveBlocks, - const MBBVector &RestoreBlocks); namespace { class PEI : public MachineFunctionPass { @@ -64,6 +69,9 @@ void getAnalysisUsage(AnalysisUsage &AU) const override; + /// \brief Check if shrink wrapping is enabled for this target and function. + bool isShrinkWrapEnabled(const MachineFunction &MF); + MachineFunctionProperties getRequiredProperties() const override { MachineFunctionProperties MFP; if (UsesCalleeSaves) @@ -77,16 +85,13 @@ bool runOnMachineFunction(MachineFunction &Fn) override; private: - std::function - SpillCalleeSavedRegisters; + std::function SpillCalleeSavedRegisters; std::function ScavengeFrameVirtualRegs; bool UsesCalleeSaves = false; + // FIXME: ShrinkWrap2: Temporary hack. Remove. RegScavenger *RS; // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved @@ -94,6 +99,7 @@ unsigned MinCSFrameIndex = std::numeric_limits::max(); unsigned MaxCSFrameIndex = 0; + // FIXME: ShrinkWrap2: Merge the shrink-wrapping logic here. // Save and Restore blocks of the current function. Typically there is a // single save block, unless Windows EH funclets are involved. MBBVector SaveBlocks; @@ -108,6 +114,14 @@ // FrameIndexVirtualScavenging is used. bool FrameIndexEliminationScavenging; + // Emit optimization remarks. + MachineOptimizationRemarkEmitter *ORE; + + void doSpillCalleeSavedRegs(MachineFunction &MF); + void doSpillCalleeSavedRegsShrinkWrap2(MachineFunction &Fn, + CalleeSavedMap &Saves, + CalleeSavedMap &Restores); + void calculateCallFrameInfo(MachineFunction &Fn); void calculateSaveRestoreBlocks(MachineFunction &Fn); @@ -132,6 +146,8 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(StackProtector) +INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) +INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_END(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion & Frame Finalization", false, false) @@ -148,9 +164,57 @@ AU.addPreserved(); AU.addPreserved(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } +bool PEI::isShrinkWrapEnabled(const MachineFunction &MF) { + auto BecauseOf = [&](const char *Title, const char *Msg, DebugLoc Loc = {}) { + MachineOptimizationRemarkMissed R(DEBUG_TYPE, Title, Loc, &MF.front()); + R << "Couldn't shrink-wrap this function because " << Msg; + ORE->emit(R); + return false; + }; + + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + + switch (EnableShrinkWrap2Opt) { + case cl::BOU_UNSET: { + if (MF.getTarget().getOptLevel() == CodeGenOpt::None) + return BecauseOf("ShrinkWrapDisabledOpt", + "shrink-wrapping is enabled at O1+."); + + if (!TFI->enableShrinkWrapping(MF)) + return BecauseOf("ShrinkWrapDisabledTarget", + "shrink-wrapping is not enabled on this target."); + // Windows with CFI has some limitations that make it impossible + // to use shrink-wrapping. + if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) + return BecauseOf("ShrinkWrapDisabledWindowsCFI", + "shrink-wrapping does not support Windows CFI yet."); + + // Sanitizers look at the value of the stack at the location + // of the crash. Since a crash can happen anywhere, the + // frame must be lowered before anything else happen for the + // sanitizers to be able to get a correct stack frame. + if (MF.getFunction()->hasFnAttribute(Attribute::SanitizeAddress)) + return BecauseOf("ShrinkWrapDisabledASAN", + "shrink-wrapping can't be enabled with ASAN."); + if (MF.getFunction()->hasFnAttribute(Attribute::SanitizeThread)) + return BecauseOf("ShrinkWrapDisabledTSAN", + "shrink-wrapping can't be enabled with TSAN."); + if (MF.getFunction()->hasFnAttribute(Attribute::SanitizeMemory)) + return BecauseOf("ShrinkWrapDisabledMSAN", + "shrink-wrapping can't be enabled with MSAN."); + } + case cl::BOU_TRUE: + return true; + case cl::BOU_FALSE: + return false; + } + llvm_unreachable("Invalid shrink-wrapping state"); +} /// StackObjSet - A set of stack object indexes typedef SmallSetVector StackObjSet; @@ -162,12 +226,12 @@ if (!SpillCalleeSavedRegisters) { const TargetMachine &TM = Fn.getTarget(); if (!TM.usesPhysRegsForPEI()) { - SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *, - unsigned &, unsigned &, const MBBVector &, - const MBBVector &) {}; + SpillCalleeSavedRegisters = [](MachineFunction &) {}; ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger &) {}; } else { - SpillCalleeSavedRegisters = doSpillCalleeSavedRegs; + SpillCalleeSavedRegisters = [this](MachineFunction &MF) { + return this->doSpillCalleeSavedRegs(MF); + }; ScavengeFrameVirtualRegs = scavengeFrameVirtualRegs; UsesCalleeSaves = true; } @@ -177,10 +241,14 @@ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + MachineFrameInfo &MFI = Fn.getFrameInfo(); RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr; + // FIXME: ShrinkWrap2: Temporary hack. Remove. + MFI.RS = RS; FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn); FrameIndexEliminationScavenging = (RS && !FrameIndexVirtualScavenging) || TRI->requiresFrameIndexReplacementScavenging(Fn); + ORE = &getAnalysis().getORE(); // Calculate the MaxCallFrameSize and AdjustsStack variables for the // function's frame information. Also eliminates call frame pseudo @@ -192,8 +260,7 @@ calculateSaveRestoreBlocks(Fn); // Handle CSR spilling and restoring, for targets that need it. - SpillCalleeSavedRegisters(Fn, RS, MinCSFrameIndex, MaxCSFrameIndex, - SaveBlocks, RestoreBlocks); + SpillCalleeSavedRegisters(Fn); // Allow the target machine to make final modifications to the function // before the frame layout is finalized. @@ -226,13 +293,14 @@ } // Warn on stack size when we exceeds the given limit. - MachineFrameInfo &MFI = Fn.getFrameInfo(); uint64_t StackSize = MFI.getStackSize(); if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) { DiagnosticInfoStackSize DiagStackSize(*F, StackSize); F->getContext().diagnose(DiagStackSize); } + // FIXME: ShrinkWrap2: Temporary hack. Remove. + MFI.RS = nullptr; delete RS; SaveBlocks.clear(); RestoreBlocks.clear(); @@ -306,6 +374,8 @@ // Use the points found by shrink-wrapping, if any. if (MFI.getSavePoint()) { + // FIXME: ShrinkWrap2: Remove check. + assert(!MFI.getShouldUseShrinkWrap2() && "Mixing shrink-wrapping passes."); SaveBlocks.push_back(MFI.getSavePoint()); assert(MFI.getRestorePoint() && "Both restore and save must be set"); MachineBasicBlock *RestoreBlock = MFI.getRestorePoint(); @@ -327,6 +397,95 @@ } } +/// Insert code that saves the callee saved registers used in the basic block. +static void insertCSRSaves(MachineBasicBlock &SaveBB, + ArrayRef CSIs) { + MachineFunction &Fn = *SaveBB.getParent(); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); + + assert(!CSIs.empty() && "No saves to insert."); + + MachineBasicBlock::iterator I = SaveBB.begin(); + if (!TFI.spillCalleeSavedRegisters(SaveBB, I, CSIs, &TRI)) { + for (const CalleeSavedInfo &CSI : CSIs) { + unsigned Reg = CSI.getReg(); + + // Update liveness. + if (!Fn.getRegInfo().isLiveIn(Reg)) + SaveBB.addLiveIn(Reg); + + // Insert the spill to the stack frame. + // FIXME: ShrinkWrap2: Check if can be killed. + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(SaveBB, I, Reg, false, CSI.getFrameIdx(), RC, + &TRI); + std::prev(I)->setFlag(MachineInstr::FrameSetup); + + // FIXME: ShrinkWrap2: Check wether we need CFI, even though it is + // ignored by the AsmPrinter. + // Emit CFI for every CSR spill: + // .cfi_offset %reg, off + MachineFrameInfo &MFI = Fn.getFrameInfo(); + if (MFI.getShouldUseShrinkWrap2()) { + unsigned Offset = MFI.getObjectOffset(CSI.getFrameIdx()); + const MCRegisterInfo *MRI = Fn.getMMI().getContext().getRegisterInfo(); + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = Fn.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + BuildMI(SaveBB, I, {}, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + } +} + +/// Insert code that restores the callee saved registers used in the basic +/// block. +static void insertCSRRestores(MachineBasicBlock &RestoreBB, + ArrayRef CSIs) { + MachineFunction &Fn = *RestoreBB.getParent(); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); + + assert(!CSIs.empty() && "No restores to insert."); + + // Restore using target interface. + MachineBasicBlock::iterator I = RestoreBB.getFirstTerminator(); + + // Restore all registers immediately before the return and any terminators + // that precede it. + if (!TFI.restoreCalleeSavedRegisters(RestoreBB, I, CSIs, &TRI)) { + for (int i = CSIs.size() - 1; i >= 0; --i) { + unsigned Reg = CSIs[i].getReg(); + const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(RestoreBB, I, Reg, CSIs[i].getFrameIdx(), RC, + &TRI); + std::prev(I)->setFlag(MachineInstr::FrameDestroy); + + assert(I != RestoreBB.begin() && + "loadRegFromStackSlot didn't insert any code!"); + + // FIXME: ShrinkWrap2: Check wether we need CFI, even though it is + // ignored by the AsmPrinter. + // Emit CFI for every CSR restore. + // .cfi_restore %reg + MachineFrameInfo &MFI = Fn.getFrameInfo(); + if (MFI.getShouldUseShrinkWrap2()) { + MachineModuleInfo &MMI = Fn.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = + Fn.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfReg)); + BuildMI(RestoreBB, I, {}, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + } +} + static void assignCalleeSavedSpillSlots(MachineFunction &F, const BitVector &SavedRegs, unsigned &MinCSFrameIndex, @@ -398,6 +557,13 @@ } MFI.setCalleeSavedInfo(CSI); + // FIXME: ShrinkWrap2: AArch64FrameLowering needs to call + // computeCaleeSaveRegisterPairs *after* calling the generic code above. We + // could duplicate this code inside + // AArch64FrameLowering::assignCalleeSavedSpillSlots, but we need to update + // MinCSFrameIndex and MaxCSFrameIndex. + if (MFI.getShouldUseShrinkWrap2()) + TFI->processValidCalleeSavedInfo(F, RegInfo, CSI); } /// Helper function to update the liveness information for the callee-saved @@ -475,74 +641,133 @@ if (CSI.empty()) return; - const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); - const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); - const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); - MachineBasicBlock::iterator I; - // Spill using target interface. for (MachineBasicBlock *SaveBlock : SaveBlocks) { - I = SaveBlock->begin(); - if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - // Insert the spill to the stack frame. - unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), - RC, TRI); - } - } + insertCSRSaves(*SaveBlock, CSI); // Update the live-in information of all the blocks up to the save point. updateLiveness(Fn); } // Restore using target interface. - for (MachineBasicBlock *MBB : RestoreBlocks) { - I = MBB->end(); - - // Skip over all terminator instructions, which are part of the return - // sequence. - MachineBasicBlock::iterator I2 = I; - while (I2 != MBB->begin() && (--I2)->isTerminator()) - I = I2; - - bool AtStart = I == MBB->begin(); - MachineBasicBlock::iterator BeforeI = I; - if (!AtStart) - --BeforeI; - - // Restore all registers immediately before the return and any - // terminators that precede it. - if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { - for (unsigned i = 0, e = CSI.size(); i != e; ++i) { - unsigned Reg = CSI[i].getReg(); - const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); - TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); - assert(I != MBB->begin() && - "loadRegFromStackSlot didn't insert any code!"); - // Insert in reverse order. loadRegFromStackSlot can insert - // multiple instructions. - if (AtStart) - I = MBB->begin(); - else { - I = BeforeI; - ++I; - } + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) + insertCSRRestores(*RestoreBlock, CSI); +} + +// FIXME: ShrinkWrap2: Name. +void PEI::doSpillCalleeSavedRegsShrinkWrap2(MachineFunction &Fn, + CalleeSavedMap &Saves, + CalleeSavedMap &Restores) { + const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); + MachineFrameInfo &MFI = Fn.getFrameInfo(); + + // Now gather the callee-saved registers we found using shrink-wrapping. + // FIXME: ShrinkWrap2: We already gathered all the CSRs in ShrinkWrap. Reuse + // somehow? + BitVector ShrinkWrapSavedRegs(TRI.getNumRegs()); + for (auto &Save : Saves) + for (const CalleeSavedInfo &CSI : Save.second) + ShrinkWrapSavedRegs.set(CSI.getReg()); + + // FIXME: ShrinkWrap2: Re-use stack slots. + assignCalleeSavedSpillSlots(Fn, ShrinkWrapSavedRegs, MinCSFrameIndex, + MaxCSFrameIndex); + + MFI.setCalleeSavedInfoValid(true); + + if (Fn.getFunction()->hasFnAttribute(Attribute::Naked)) + return; + + // FIXME: ShrinkWrap2: This is awful. We first call + // assignCalleeSavedSpillSlots, that fills MFI.CalleeSavedInfo which is used + // for the ENTIRE function. Then, we need to reassign the FrameIdx back to the + // Saves / Restores map. + SmallVector *, unsigned>, 2> ToRemove; + const std::vector &CSIs = MFI.getCalleeSavedInfo(); + for (auto *Map : {&Saves, &Restores}) { + for (auto &Elt : *Map) { + for (const CalleeSavedInfo &CSI : Elt.second) { + unsigned Reg = CSI.getReg(); + // Look for the register in the assigned CSIs, and reassign it in the + // map. + auto It = find_if(CSIs, [&](const CalleeSavedInfo &NewCSI) { + return NewCSI.getReg() == Reg; + }); + if (It != CSIs.end()) + // FIXME: ShrinkWrap2: const_cast... + const_cast(CSI).setFrameIdx(It->getFrameIdx()); + else // Also, if we can't find it in the list, it means the target + // removed it. x86 does this for FP, since the spill is part of the + // prologue emission. + ToRemove.emplace_back(&Elt.second, Reg); } } } + for (auto& Pair : ToRemove) { + std::vector &V = *Pair.first; + unsigned Reg = Pair.second; + V.erase(std::remove_if(V.begin(), V.end(), + [&](const CalleeSavedInfo &CSI) { + return CSI.getReg() == Reg; + }), + V.end()); + } + + for (auto &Save : Saves) { + insertCSRSaves(*Save.first, Save.second); + // FIXME: ShrinkWrap2: Update liveness only after all spills / restores? + updateLiveness(Fn); + } + + for (auto &Restore : Restores) + insertCSRRestores(*Restore.first, Restore.second); } -static void doSpillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS, - unsigned &MinCSFrameIndex, - unsigned &MaxCSFrameIndex, - const MBBVector &SaveBlocks, - const MBBVector &RestoreBlocks) { +void PEI::doSpillCalleeSavedRegs(MachineFunction &Fn) { const Function *F = Fn.getFunction(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + MachineFrameInfo &MFI = Fn.getFrameInfo(); + MinCSFrameIndex = std::numeric_limits::max(); MaxCSFrameIndex = 0; + + /// If any, contains better save points for the prologue found by + /// shrink-wrapping. + CalleeSavedMap Saves; + /// If any, contains better restore points for the epilogue found by + /// shrink-wrapping. + CalleeSavedMap Restores; + + if (!Fn.empty() && isShrinkWrapEnabled(Fn)) { + ShrinkWrapper SW(Fn); + auto *MBFI = &getAnalysis(); + if (SW.areResultsInteresting(MBFI)) { + MachineFrameInfo &MFI = Fn.getFrameInfo(); + MFI.setShouldUseShrinkWrap2(true); + SW.emitRemarks(ORE, MBFI); + } + auto &SWSaves = SW.getSaves(); + auto &SWRestores = SW.getRestores(); + const MCPhysReg *CSRegs = Fn.getRegInfo().getCalleeSavedRegs(); + auto Transform = [&](const DenseMap &Src, + CalleeSavedMap &Dst) { + for (auto &KV : Src) { + MachineBasicBlock *MBB = Fn.getBlockNumbered(KV.first); + const BitVector &Regs = KV.second; + std::vector &CSI = Dst[MBB]; + + for (unsigned RegIdx : Regs.set_bits()) + CSI.emplace_back(CSRegs[RegIdx]); + } + }; + Transform(SWSaves, Saves); + Transform(SWRestores, Restores); + } + + // FIXME: ShrinkWrap2: Share code somehow. + if (MFI.getShouldUseShrinkWrap2()) + return doSpillCalleeSavedRegsShrinkWrap2(Fn, Saves, Restores); + // Determine which of the registers in the callee save list should be saved. BitVector SavedRegs; TFI->determineCalleeSaves(Fn, SavedRegs, RS); @@ -977,6 +1202,11 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) { const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + // FIXME: ShrinkWrap2: Stack alginment / adjustment / etc. go in emitPrologue. + // For now, we add these at the entry / exit of the function, and we spill + // callee saves using our own blocks. There should be a way to shrink-wrap the + // stack operations as well. + // Add prologue to the function... for (MachineBasicBlock *SaveBlock : SaveBlocks) TFI.emitPrologue(Fn, *SaveBlock); @@ -985,9 +1215,11 @@ for (MachineBasicBlock *RestoreBlock : RestoreBlocks) TFI.emitEpilogue(Fn, *RestoreBlock); + // FIXME: ShrinkWrap2: Will this still work? for (MachineBasicBlock *SaveBlock : SaveBlocks) TFI.inlineStackProbe(Fn, *SaveBlock); + // FIXME: ShrinkWrap2: Will this still work? // Emit additional code that is required to support segmented stacks, if // we've been asked for it. This, when linked with a runtime with support // for segmented stacks (libgcc is one), will result in allocating stack @@ -997,6 +1229,7 @@ TFI.adjustForSegmentedStacks(Fn, *SaveBlock); } + // FIXME: ShrinkWrap2: Will this still work? // Emit additional code that is required to explicitly handle the stack in // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The // approach is rather similar to that of Segmented Stacks, but it uses a Index: lib/CodeGen/ShrinkWrapper.cpp =================================================================== --- /dev/null +++ lib/CodeGen/ShrinkWrapper.cpp @@ -0,0 +1,891 @@ +//===- lib/CodeGen/ShrinkWrapper.cpp - Shrink Wrapping Utility --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Shrink-wrapper implementation. +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/raw_ostream.h" +#include + +#include "llvm/CodeGen/ShrinkWrapper.h" + +// FIXME: ShrinkWrap2: Name +#define DEBUG_TYPE "shrink-wrap2" + +#define VERBOSE_DEBUG(X) \ + do { \ + if (VerboseDebug) \ + DEBUG(X); \ + } while (0); + +using namespace llvm; + +// FIXME: ShrinkWrap2: Remove ? +static cl::opt + VerboseDebug("shrink-wrap-verbose", cl::Hidden, + cl::desc("verbose debug output")); + +// FIXME: ShrinkWrap2: Remove, debug. +static cl::opt ViewCFGDebug("shrink-wrap-view", cl::Hidden, + cl::desc("view cfg")); + +void ShrinkWrapInfo::determineCSRUses() { + const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Walk all the uses of each callee-saved register, and map them to their + // basic blocks. + const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs(); + + BitVector CSRegUnits(TRI.getNumRegUnits()); + DenseMap RegUnitToCSRIdx; + for (unsigned i = 0; CSRegs[i]; ++i) { + for (MCRegUnitIterator RegUnit(CSRegs[i], &TRI); RegUnit.isValid(); + ++RegUnit) { + RegUnitToCSRIdx[*RegUnit] = i; + CSRegUnits.set(*RegUnit); + } + } + + auto MarkAsUsedBase = [&](unsigned RegIdx, unsigned MBBNum) { + + BitVector &Used = Uses[MBBNum]; + if (Used.empty()) + Used.resize(getNumResultBits()); + Used.set(RegIdx); + }; + auto MarkAsUsed = [&](unsigned RegIdx, const MachineBasicBlock &MBB, + bool isTerminator = false) { + unsigned MBBNum = MBB.getNumber(); + MarkAsUsedBase(RegIdx, MBBNum); + // If it's a terminator, mark the successors as used as well, + // since we can't save after a terminator (i.e. cbz w23, #10). + if (isTerminator) + for (MachineBasicBlock *Succ : MBB.successors()) + MarkAsUsedBase(RegIdx, Succ->getNumber()); + }; + + // FIXME: ShrinkWrap2: Naked functions. + // FIXME: ShrinkWrap2: __builtin_unwind_init. + + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + for (const MachineOperand &MO : MI.operands()) { + + if (MO.isRegMask()) { + // Check for regmasks only on the original CSR, as the aliases are not + // always there. + for (unsigned i = 0; CSRegs[i]; ++i) + if (MO.clobbersPhysReg(CSRegs[i])) + MarkAsUsed(i, MBB, MI.isTerminator()); + } else if (MO.isReg() && MO.getReg() && (MO.readsReg() || MO.isDef())) { + for (MCRegUnitIterator RegUnit(MO.getReg(), &TRI); RegUnit.isValid(); + ++RegUnit) + if (CSRegUnits.test(*RegUnit)) + MarkAsUsed(RegUnitToCSRIdx[*RegUnit], MBB, MI.isTerminator()); + } + } + } + } +} + +const BitVector *ShrinkWrapInfo::getUses(unsigned MBBNum) const { + auto& Use = Uses[MBBNum]; + if (Use.empty()) + return nullptr; + return &Use; +} + +ShrinkWrapper::SCCLoopInfo::SCCLoopInfo(const MachineFunction &MF) { + // Create the SCCLoops. + for (auto I = scc_begin(&MF); !I.isAtEnd(); ++I) { + // Skip non-loop SCCs. + if (!I.hasLoop()) + continue; + + SCCs.emplace_back(); + // The SCCLoop number is the first basic block number in the SCC. + unsigned Number = (*I->begin())->getNumber(); + SCCs.back().Number = Number; + SCCs.back().Size = I->size(); + + // The number used in MBBToSCC is the position of the SCC in `SCCs` + for (const MachineBasicBlock *MBB : *I) + MBBToSCC[MBB->getNumber()] = SCCs.size() - 1; + } + + // Compute successors / predecessors of the SCCLoops. + for (const MachineBasicBlock &MBB : MF) { + for (const MachineBasicBlock *Succ : MBB.successors()) { + SCCLoop *MBBSCC = getSCCLoopFor(MBB.getNumber()); + SCCLoop *SuccSCC = getSCCLoopFor(Succ->getNumber()); + // The successor is a loop, but not the current block. It means the + // successor's predecessor is the current block. + if (!MBBSCC && SuccSCC) + SuccSCC->Predecessors.insert(&MBB); + // The successor is not a loop, but the current block is one. It means + // that the loop's successor is the block's successor. + else if (MBBSCC && !SuccSCC) + MBBSCC->Successors.insert(Succ); + // The successor and the block are loops. We now need to connect SCCs + // together. + else if (MBBSCC && SuccSCC && MBBSCC != SuccSCC) { + MBBSCC->Successors.insert(Succ); + SuccSCC->Predecessors.insert(&MBB); + } + } + for (const MachineBasicBlock *Pred : MBB.predecessors()) { + SCCLoop *MBBSCC = getSCCLoopFor(MBB.getNumber()); + SCCLoop *PredSCC = getSCCLoopFor(Pred->getNumber()); + // The predecessor is a loop, but not the current block. It means the + // predecessor's successor is the current block. + if (!MBBSCC && PredSCC) + PredSCC->Successors.insert(&MBB); + // The predecessor is not a loop, but the current block is one. It + // means that the loop's predecessor is the block's predecessor. + else if (MBBSCC && !PredSCC) + MBBSCC->Predecessors.insert(Pred); + // The successor and the block are loops. We now need to connect SCCs + // together. + else if (MBBSCC && PredSCC && MBBSCC != PredSCC) { + MBBSCC->Predecessors.insert(Pred); + PredSCC->Successors.insert(&MBB); + } + } + } +} + +unsigned ShrinkWrapper::blockNumber(unsigned MBBNum) const { + if (const SCCLoop *C = SI.getSCCLoopFor(MBBNum)) + return C->getNumber(); + return MBBNum; +} + +iterator_range +ShrinkWrapper::blockSuccessors(unsigned MBBNum) const { + if (const SCCLoop *C = SI.getSCCLoopFor(MBBNum)) + return {C->Successors.begin(), C->Successors.end()}; + const MachineBasicBlock *MBB = MF.getBlockNumbered(MBBNum); + return {&*MBB->succ_begin(), &*MBB->succ_end()}; +} + +iterator_range +ShrinkWrapper::blockPredecessors(unsigned MBBNum) const { + if (const SCCLoop *C = SI.getSCCLoopFor(MBBNum)) + return {C->Predecessors.begin(), C->Predecessors.end()}; + const MachineBasicBlock *MBB = MF.getBlockNumbered(MBBNum); + return {&*MBB->pred_begin(), &*MBB->pred_end()}; +} + +void ShrinkWrapper::determineUses() { + // FIXME: ShrinkWrap2: We do unnecessary copies here. + for (const MachineBasicBlock &MBB : MF) { + if (const TargetResultSet *Use = SWI->getUses(MBB.getNumber())) { + unsigned MBBNum = blockNumber(MBB.getNumber()); + Uses[MBBNum].resize(SWI->getNumResultBits()); + Uses[MBBNum] |= *Use; + } + } +} + +void ShrinkWrapper::removeUsesOnNoReturnPaths() { + NoReturnBlocks.resize(MF.getNumBlockIDs()); + + // Mark all reachable blocks from any return blocks. + for (const MachineBasicBlock &MBB : MF) + if (MBB.isReturnBlock()) + for (const MachineBasicBlock *Block : inverse_depth_first(&MBB)) + NoReturnBlocks.set(Block->getNumber()); + + // Flip, so that we can get the non-reachable blocks. + NoReturnBlocks.flip(); + + for (unsigned MBBNum : NoReturnBlocks.set_bits()) { + DEBUG(dbgs() << "Remove uses from no-return BB#" << MBBNum << '\n'); + Uses[MBBNum].clear(); + } +} + +void ShrinkWrapper::dumpUses() const { + for (const auto& Use : enumerate(Uses)) { + if (!Use.value().count()) + continue; + + dbgs() << "BB#" << Use.index() << " uses : "; + int Elt = Use.value().find_first(); + if (Elt >= 0) + SWI->printElt(Elt, dbgs()); + for (Elt = Use.value().find_next(Elt); Elt > 0; + Elt = Use.value().find_next(Elt)) { + dbgs() << ", "; + SWI->printElt(Elt, dbgs()); + } + dbgs() << '\n'; + } +} + +void ShrinkWrapper::markUsesOutsideLoops() { + // Keep track of the elements to attach to a basic block. + SparseBBResultSetMap ToInsert; + for (const auto &Use : enumerate(Uses)) { + unsigned MBBNum = Use.index(); + const TargetResultSet &Elts = Use.value(); + + auto Mark = [&](const MachineBasicBlock *Block) { + unsigned BlockNum = Block->getNumber(); + TargetResultSet &ToInsertTo = ToInsert[BlockNum]; + if (ToInsertTo.empty()) + ToInsertTo.resize(SWI->getNumResultBits()); + ToInsertTo |= Elts; + VERBOSE_DEBUG(dbgs() << "Mark: BB#" << BlockNum << '\n'); + }; + + if (const SCCLoop *C = SI.getSCCLoopFor(MBBNum)) { + DEBUG(dbgs() << "Loop for CSR: BB#" << MBBNum << '\n'); + + // Mark all the entry blocks of the loop. + for (const MachineBasicBlock *Block : C->predecessors()) + Mark(Block); + + // Mark all the exit blocks of the loop. + for (const MachineBasicBlock *Exit : C->successors()) + Mark(Exit); + } + } + + for (auto &KV : ToInsert) + Uses[blockNumber(KV.first)] |= KV.second; +} + +void ShrinkWrapper::computeAttributes( + unsigned Elt, AttributeMap &Attrs, + ReversePostOrderTraversal &RPOT) const { + auto UsesElt = [&](unsigned MBBNum) { + auto &Use = Uses[MBBNum]; + if (Use.empty()) + return false; + return Use.test(Elt); + }; + + auto Assign = [&](TargetResultSet &Set, bool New) { + if (Set.test(Elt) != New) + Set.flip(Elt); + }; + + // Count how many times we visited a SCCLoop. + DenseMap SCCVisited; + + // PO traversal for anticipation computation. We want to handle the SCC only + // when we reach the *LAST* component. + for (const MachineBasicBlock *MBB : make_range(RPOT.rbegin(), RPOT.rend())) { + unsigned MBBNum = MBB->getNumber(); + if (const SCCLoop *C = SI.getSCCLoopFor(MBB->getNumber())) { + if (++SCCVisited[C] != C->getSize()) + continue; + else + MBBNum = C->getNumber(); + } + + SWAttributes &Attr = Attrs[MBBNum]; + + // If the element is used in the block, or if it is anticipated in all + // successors it is also anticipated at the beginning, since we consider + // entire blocks. + // - + // ANTIN = | APP || ANTOUT + // - + TargetResultSet &ANTINb = Attr.ANTIN; + bool NewANTIN = UsesElt(MBBNum) || ANTOUT(Attrs, MBBNum, Elt); + Assign(ANTINb, NewANTIN); + } + + // Reuse the map. + SCCVisited.clear(); + + // RPO traversal for availability computation. We want to handle the SCC only + // when we reach the *FIRST* component. + for (const MachineBasicBlock *MBB : RPOT) { + unsigned MBBNum = MBB->getNumber(); + if (const SCCLoop *C = SI.getSCCLoopFor(MBB->getNumber())) { + if (++SCCVisited[C] != 1) + continue; + else + MBBNum = C->getNumber(); + } + + SWAttributes &Attr = Attrs[MBBNum]; + + // If the element is used in the block, or if it is always available in + // all predecessors , it is also available on exit, since we consider + // entire blocks. + // - + // AVOUT = | APP || AVIN + // - + TargetResultSet &AVOUTb = Attr.AVOUT; + bool NewAVOUT = UsesElt(MBBNum) || AVIN(Attrs, MBBNum, Elt); + Assign(AVOUTb, NewAVOUT); + } + + VERBOSE_DEBUG(dumpAttributes(Elt, Attrs)); +} + +bool ShrinkWrapper::hasCriticalEdges(unsigned Elt, AttributeMap &Attrs) { + bool Needs = false; + for (const MachineBasicBlock &MBB : MF) { + bool IsSCCLoop = false; + if (const SCCLoop *C = SI.getSCCLoopFor(MBB.getNumber())) { + // Skip all the blocks that are not the number of the SCC, since all the + // attributes are based on that number. + if (static_cast(MBB.getNumber()) != C->getNumber()) + continue; + else + IsSCCLoop = true; + } + + unsigned MBBNum = blockNumber(MBB.getNumber()); + // If the block is never returning, we won't bother saving / restoring. + if (NoReturnBlocks.test(MBBNum)) + continue; + + SWAttributes &Attr = Attrs[MBBNum]; + // Check if this block is ANTIN and has an incoming critical edge where it + // is not ANTIN. If it's the case, mark it as used, and recompute. + if (Attr.ANTIN.test(Elt)) { + auto Preds = blockPredecessors(MBBNum); + // We're looking for more than 2 predecessors. Also, if it's a SCCLoop, it + // has a predecessor that is itself. + if (std::distance(Preds.begin(), Preds.end()) >= 2 || IsSCCLoop) { + for (const MachineBasicBlock *P : Preds) { + unsigned PredNum = blockNumber(P->getNumber()); + SWAttributes &Attr = Attrs[PredNum]; + TargetResultSet &ANTINp = Attr.ANTIN; + if (!ANTINp.test(Elt)) { + // FIXME: ShrinkWrap2: emit remark. + VERBOSE_DEBUG(dbgs() + << "Incoming critical edge in " << MBBNum << ".\n"); + // Mark it as used. + TargetResultSet &Used = Uses[PredNum]; + if (Used.empty()) + Used.resize(SWI->getNumResultBits()); + Used.set(Elt); + + // Also, mark it as ANTIN and AVOUT, since we're not calling + // populateAttributes anymore. + ANTINp.set(Elt); + Attr.AVOUT.set(Elt); + Needs = true; + } + } + } + } + // Check if this block is AVOUT and has an outgoing critical edge where it + // is not AVOUT. If it's the case, mark it as used, and recompute. + if (Attr.AVOUT.test(Elt)) { + auto Succs = blockSuccessors(MBBNum); + // We're looking for more than 2 successors. Also, if it's a SCCLoop, it + // has a predecessor that is itself. + if (std::distance(Succs.begin(), Succs.end()) >= 2 || IsSCCLoop) { + for (const MachineBasicBlock *S : Succs) { + unsigned SuccNum = blockNumber(S->getNumber()); + SWAttributes &Attr = Attrs[SuccNum]; + TargetResultSet &AVOUTs = Attr.AVOUT; + if (!AVOUTs.test(Elt)) { + // FIXME: ShrinkWrap2: emit remark. + VERBOSE_DEBUG(dbgs() + << "Outgoing critical edge in " << MBBNum << ".\n"); + // Mark it as used. + TargetResultSet &Used = Uses[SuccNum]; + if (Used.empty()) + Used.resize(SWI->getNumResultBits()); + Used.set(Elt); + + // Also, mark it as AVOUT and ANTIN, since we're not calling + // populateAttrbutes anymore. + AVOUTs.set(Elt); + Attr.ANTIN.set(Elt); + Needs = true; + } + } + } + } + } + // Recompute if needed. + return Needs; +} + +void ShrinkWrapper::gatherAttributesResults(unsigned Elt, AttributeMap &Attrs) { + for (const MachineBasicBlock &MBB : MF) { + bool IsSCCLoop = false; + if (const SCCLoop *C = SI.getSCCLoopFor(MBB.getNumber())) { + // Skip all the blocks that are not the number of the SCC, since all the + // attributes are based on that number. + if (static_cast(MBB.getNumber()) != C->getNumber()) + continue; + else + IsSCCLoop = true; + } + + unsigned MBBNum = blockNumber(MBB.getNumber()); + // If the block is never returning, we won't bother saving / restoring. + if (NoReturnBlocks.test(MBBNum)) + continue; + + SWAttributes &Attr = Attrs[MBBNum]; + + // If the uses are anticipated on *all* the paths leaving this block, and if + // it is not available at the entry of this block (if it is, then it means + // it has been saved already, but not restored), and if *none* of the + // predecessors anticipates this element on their output (we want to get the + // "highest" block), then we can identify a save point for the function. + // + // SAVE = ANTIN && !AVIN && !ANTIN(pred[i]) + // + bool NS = + none_of(blockPredecessors(MBBNum), [&](const MachineBasicBlock *P) { + return Attrs[blockNumber(P->getNumber())].ANTIN.test(Elt); + }); + if (NS && Attr.ANTIN.test(Elt) && !AVIN(Attrs, MBBNum, Elt)) { + TargetResultSet &Save = Saves[MBBNum]; + if (Save.empty()) + Save.resize(SWI->getNumResultBits()); + Save.set(Elt); + } + + // If the uses are available on *all* the paths leading to this block, and + // if the element is not anticipated at the exit of this block (if it is, + // then it means it has been restored already), and if *none* of the + // successors make the element available (we want to cover the // deepest // + // use), then we can identify a restrore point for the function. + // + // RESTORE = AVOUT && !ANTOUT && !AVOUT(succ[i]) + // + bool NR = none_of(blockSuccessors(MBBNum), [&](const MachineBasicBlock *S) { + return Attrs[blockNumber(S->getNumber())].AVOUT.test(Elt); + }); + if (NR && Attr.AVOUT.test(Elt) && !ANTOUT(Attrs, MBBNum, Elt)) { + TargetResultSet &Restore = Restores[MBBNum]; + if (Restore.empty()) + Restore.resize(SWI->getNumResultBits()); + Restore.set(Elt); + } + } +} + +void ShrinkWrapper::dumpAttributes(unsigned Elt, + const AttributeMap &Attrs) const { + for (const MachineBasicBlock &MBB : MF) { + unsigned MBBNum = MBB.getNumber(); + if (const SCCLoop *C = SI.getSCCLoopFor(MBBNum)) + if (MBBNum != C->getNumber()) + continue; + const SWAttributes &Attr = Attrs[MBBNum]; + dbgs() << "BB#" << MBBNum << "<"; + SWI->printElt(Elt, dbgs()); + dbgs() << ">" + << ":\n\tANTOUT : " << ANTOUT(Attrs, MBBNum, Elt) << '\n' + << "\tANTIN : " << Attr.ANTIN.test(Elt) << '\n' + << "\tAVIN : " << AVIN(Attrs, MBBNum, Elt) << '\n' + << "\tAVOUT : " << Attr.AVOUT.test(Elt) << '\n'; + } +} + +void ShrinkWrapper::postProcessResults(const BBResultSetMap &OldUses) { + // If there is only one use of the element, and multiple saves / restores, + // remove them and place the save / restore at the used MBB's boundaries. + for (unsigned Elt : AllElts.set_bits()) { + // FIXME: ShrinkWrap2: 2x std::find_if. + auto HasElt = [&](const TargetResultSet &Res) { + return Res.empty() ? false : Res.test(Elt); + }; + auto Found1 = find_if(OldUses, HasElt); + auto Found2 = Found1 == OldUses.end() + ? Found1 + : std::find_if(std::next(Found1), OldUses.end(), HasElt); + if (Found1 != OldUses.end() && Found2 == OldUses.end()) { + // Gather all the saves. + MBBSet SavesElt(MF.getNumBlockIDs()); + for (auto &KV : Saves) { + unsigned MBBNum = KV.first; + const TargetResultSet &Elts = KV.second; + if (Elts.test(Elt)) + SavesElt.set(MBBNum); + } + + // Gather all the restores. + MBBSet RestoresElt(MF.getNumBlockIDs()); + for (auto &KV : Restores) { + unsigned MBBNum = KV.first; + const TargetResultSet &Elts = KV.second; + if (Elts.test(Elt)) + RestoresElt.set(MBBNum); + } + + // If we only have a single save and a single restore, keep it that way. + if (SavesElt.count() == 1 && RestoresElt.count() == 1) + continue; + + // Remove saves and restores from the maps. + for (unsigned MBBNum : SavesElt.set_bits()) + Saves[MBBNum].reset(Elt); + for (unsigned MBBNum : RestoresElt.set_bits()) + Restores[MBBNum].reset(Elt); + + // Add it to the unique block that uses it. + unsigned MBBNum = std::distance(OldUses.begin(), Found1); + for (auto *Map : {&Saves, &Restores}) { + TargetResultSet &Elts = (*Map)[MBBNum]; + if (Elts.empty()) + Elts.resize(SWI->getNumResultBits()); + Elts.set(Elt); + } + } + } + + // Remove all the empty entries from the Saves / Restores maps. + // FIXME: ShrinkWrap2: Should we even have empty entries? + SmallVector ToRemove; + for (auto *Map : {&Saves, &Restores}) { + for (auto It = Map->begin(), End = Map->end(); It != End; ++It) + if (It->second.count() == 0) + ToRemove.push_back(It); + for (auto It : ToRemove) + Map->erase(It); + ToRemove.clear(); + } +} + +unsigned ShrinkWrapper::computeShrinkWrappingCost( + MachineBlockFrequencyInfo *MBFI) const { + unsigned Cost = 0; + for (const MachineBasicBlock &MBB : MF) { + unsigned BlockCost = 0; + for (auto *Map : {&Saves, &Restores}) { + auto Found = Map->find(MBB.getNumber()); + if (Found != Map->end()) + BlockCost += Found->second.count(); + } + auto Frequency = + static_cast(MBFI->getBlockFreq(&MBB).getFrequency()) / + MBFI->getEntryFreq(); + Cost += BlockCost * Frequency * 100; + } + return Cost; +} + +unsigned +ShrinkWrapper::computeDefaultCost(MachineBlockFrequencyInfo *MBFI) const { + unsigned Cost = 0; + for (const MachineBasicBlock &MBB : MF) { + unsigned BlockCost = + &MBB == &MF.front() || MBB.isReturnBlock() ? AllElts.count() : 0; + auto Frequency = + static_cast(MBFI->getBlockFreq(&MBB).getFrequency()) / + MBFI->getEntryFreq(); + Cost += BlockCost * Frequency * 100; + } + return Cost; +} + +void ShrinkWrapper::verifySavesRestores() const { + auto HasElt = [&](const SparseBBResultSetMap &Map, unsigned Elt) { + return find_if(Map, [&](const std::pair &KV) { + return KV.second.test(Elt); + }) != Map.end(); + }; + + auto RestoresElt = [&](unsigned Elt) { return HasElt(Restores, Elt); }; + auto SavesElt = [&](unsigned Elt) { return HasElt(Saves, Elt); }; + + // Check that all the CSRs used in the function are saved at least once. + for (unsigned Elt : AllElts.set_bits()) + if (!SavesElt(Elt) && !RestoresElt(Elt)) + llvm_unreachable("Used CSR is never saved!"); + + // Check that there are no saves / restores in a loop. + for (const SparseBBResultSetMap *Map : {&Saves, &Restores}) + for (auto &KV : *Map) + if (SI.getSCCLoopFor(KV.first)) + llvm_unreachable("Save / restore in a loop."); + + // Keep track of the currently saved elements. + TargetResultSet Saved(SWI->getNumResultBits()); + // Cache the state of each call, to avoid redundant checks. + std::vector> Cache(MF.getNumBlockIDs()); + + // Verify if: + // * All the saves are restored. + // * All the restores are related to a store. + // * There are no nested stores. + std::function verifySavesRestoresRec = + [&](const MachineBasicBlock *MBB) { + unsigned MBBNum = MBB->getNumber(); + // Don't even check no-return blocks. + if (MBB->succ_empty() && !MBB->isReturnBlock()) { + VERBOSE_DEBUG(dbgs() << "IN: BB#" << MBBNum << " is an no-return\n"); + return; + } + + SmallVectorImpl &State = Cache[MBBNum]; + if (find(State, Saved) != State.end()) { + VERBOSE_DEBUG(dbgs() << "IN: BB#" << MBBNum << " already visited.\n"); + return; + } + + State.push_back(Saved); + + VERBOSE_DEBUG(dbgs() << "IN: BB#" << MBBNum << ": Save "; + for (unsigned Elt + : Saved.set_bits()) { + SWI->printElt(Elt, dbgs()); + dbgs() << " "; + } dbgs() + << '\n'); + + const TargetResultSet &SavesMBB = Saves.lookup(MBBNum); + const TargetResultSet &RestoresMBB = Restores.lookup(MBBNum); + + // Get the intersection of the currently saved elements and the + // elements to be saved for this basic block. If the intersection is + // not empty, it means we have nested saves for the same elements. + TargetResultSet Intersection(SavesMBB); + Intersection &= Saved; + + DEBUG(for (unsigned Elt + : Intersection.set_bits()) { + SWI->printElt(Elt, dbgs()); + dbgs() << " is saved twice.\n"; + }); + + assert(Intersection.count() == 0 && + "Nested saves for the same elements."); + Intersection.reset(); + + // Save the elements to be saved. + for (unsigned Elt : SavesMBB.set_bits()) { + Saved.set(Elt); + VERBOSE_DEBUG(dbgs() << "IN: BB#" << MBBNum << ": Save "; + SWI->printElt(Elt, dbgs()); dbgs() << ".\n"); + } + + // If the intersection of the currently saved elements and the + // elements to be restored for this basic block is not equal to the + // restores, it means we are trying to restore something that is not + // saved. + Intersection = RestoresMBB; + Intersection &= Saved; + + assert(Intersection.count() == RestoresMBB.count() && + "Not all restores are saved."); + + // Restore the elements to be restored. + for (int Elt : RestoresMBB.set_bits()) { + Saved.reset(Elt); + VERBOSE_DEBUG(dbgs() << "IN: BB#" << MBBNum << ": Restore "; + SWI->printElt(Elt, dbgs()); dbgs() << ".\n"); + } + + if (MBB->succ_empty() && Saved.count() != 0) + llvm_unreachable("Not all saves are restored."); + + // Using the current set of saved elements, walk all the successors + // recursively. + for (MachineBasicBlock *Succ : MBB->successors()) + verifySavesRestoresRec(Succ); + + // Restore the state prior of the function exit. + for (unsigned Elt : RestoresMBB.set_bits()) { + Saved.set(Elt); + VERBOSE_DEBUG(dbgs() << "OUT: BB#" << MBBNum << ": Save "; + SWI->printElt(Elt, dbgs()); dbgs() << ".\n"); + } + for (unsigned Elt : SavesMBB.set_bits()) { + Saved.reset(Elt); + VERBOSE_DEBUG(dbgs() << "OUT: BB#" << MBBNum << ": Restore "; + SWI->printElt(Elt, dbgs()); dbgs() << ".\n"); + } + }; + + verifySavesRestoresRec(&MF.front()); +} + +void ShrinkWrapper::emitRemarks(MachineOptimizationRemarkEmitter *ORE, + MachineBlockFrequencyInfo *MBFI) const { + unsigned Cost = computeShrinkWrappingCost(MBFI); + unsigned DefaultCost = computeDefaultCost(MBFI); + int Improvement = DefaultCost - Cost; + MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "ShrinkWrapped", {}, + &MF.front()); + R << "Shrink-wrapped function with cost " << ore::NV("ShrinkWrapCost", Cost) + << " which is " << ore::NV("ShrinkWrapCostImprovement", Improvement) + << " better than " + << ore::NV("OriginalShrinkWrapCost", DefaultCost) + << ", during which attributes were recomputed " + << ore::NV("ShrinkWrapRecomputed", AttributesRecomputed) << " times."; + ORE->emit(R); +} + +bool ShrinkWrapper::areResultsInteresting( + MachineBlockFrequencyInfo *MBFI) const { + if (!hasUses()) + return false; + if (Saves.size() == 1) { // If we have only one save, + unsigned MBBNum = Saves.begin()->first; + unsigned FrontMBBNum = MF.front().getNumber(); + const TargetResultSet &EltsSaved = Saves.begin()->second; + if (MBBNum == FrontMBBNum // and the save it's in the entry block, + && EltsSaved == AllElts) { // and it saves *ALL* the CSRs + DEBUG(dbgs() << "No shrink-wrapping performed, all saves in the entry " + "block.\n";); + return false; // then it's not interesting. + } + } + + // If the cost with shrink wrapping is better than the default, use it. + unsigned Cost = computeShrinkWrappingCost(MBFI); + unsigned DefaultCost = computeDefaultCost(MBFI); + if (Cost >= DefaultCost) + DEBUG(dbgs() << "No shrink-wrapping performed. ShrinkWrapCost: " << Cost + << ", DefaultCost: " << DefaultCost << '\n'); + return Cost < DefaultCost; +} + +void ShrinkWrapper::dumpResults() const { + for (unsigned MBBNum = 0; MBBNum < MF.getNumBlockIDs(); ++MBBNum) { + if (Saves.count(MBBNum) || Restores.count(MBBNum)) { + DEBUG(dbgs() << "BB#" << MBBNum << ": Saves: "); + auto Save = Saves.lookup(MBBNum); + DEBUG(for (unsigned Elt + : Save.set_bits()) { + SWI->printElt(Elt, dbgs()); + dbgs() << ", "; + }); + DEBUG(dbgs() << "| Restores: "); + auto Restore = Restores.lookup(MBBNum); + DEBUG(for (unsigned Elt + : Restore.set_bits()) { + SWI->printElt(Elt, dbgs()); + dbgs() << ", "; + }); + + DEBUG(dbgs() << '\n'); + } + } +} + +ShrinkWrapper::ShrinkWrapper(const MachineFunction &MF) + : ShrinkWrapper( + MF, + MF.getSubtarget().getFrameLowering()->createCSRShrinkWrapInfo(MF)) {} + +ShrinkWrapper::ShrinkWrapper(const MachineFunction &MF, + std::unique_ptr SW) + : MF(MF), Uses(MF.getNumBlockIDs()), SWI(std::move(SW)), SI(MF) { + DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); + + if (ViewCFGDebug == cl::BOU_TRUE) + MF.viewCFGOnly(); + + VERBOSE_DEBUG(for (auto &SCC + : SI.SCCs) { + dbgs() << "SCCLoop: " << SCC.getNumber() << "\n Pred: "; + for (auto *Pred : SCC.Predecessors) + dbgs() << Pred->getNumber() << ", "; + dbgs() << "\n Succ: "; + for (auto *Succ : SCC.Successors) + dbgs() << Succ->getNumber() << ", "; + dbgs() << '\n'; + }); + + // FIXME: ShrinkWrap2: Remove. Call SWI directly. + determineUses(); + if (!hasUses()) + return; + + DEBUG(dumpUses()); + + // Don't bother saving if we know we're never going to return. + removeUsesOnNoReturnPaths(); + // FIXME: ShrinkWrap2: Check if there are any modifications before printing. + DEBUG(dbgs() << "**** After removing uses on no-return paths\n";); + DEBUG(dumpUses()); + + markUsesOutsideLoops(); + // FIXME: ShrinkWrap2: Check if there are any modifications before printing. + DEBUG(dbgs() << "**** After marking uses inside loops\n";); + DEBUG(dumpUses()); + + // FIXME: ShrinkWrap2: Find a better way to avoid treating added CSRs the same + // as original ones. This is needed for postProcessResults. + // FIXME: ShrinkWrap2: Probably just save / restore once per block if there + // is only one register from the beginning. + auto OldUses = Uses; + + AllElts.resize(SWI->getNumResultBits()); + for (const auto &Use : Uses) + AllElts |= Use; + + auto &EntryUses = Uses[MF.front().getNumber()]; + + // Compute the dataflow attributes described by Fred C. Chow. + AttributeMap Attrs; + // Reserve + emplace_back to avoid copies of empty bitvectors.. + unsigned Max = MF.getNumBlockIDs(); + Attrs.reserve(Max); + for (unsigned i = 0; i < Max; ++i) + Attrs.emplace_back(*SWI); + // For each register, compute the dataflow attributes. + // FIXME: ShrinkWrap2: Compute all elements at once. + ReversePostOrderTraversal RPOT(&MF); + for (unsigned Elt : AllElts.set_bits()) { + // If it's used in the entry block, don't even compute it. We know the + // results already. + if (!EntryUses.empty() && EntryUses.test(Elt)) + continue; + // Compute the attributes. + computeAttributes(Elt, Attrs, RPOT); + + // If we detected critical edges, compute again. + while (hasCriticalEdges(Elt, Attrs)) { + ++AttributesRecomputed; + computeAttributes(Elt, Attrs, RPOT); + } + + gatherAttributesResults(Elt, Attrs); + VERBOSE_DEBUG(dumpResults()); + } + + VERBOSE_DEBUG(dbgs() << "**** Analysis results\n";); + VERBOSE_DEBUG(dumpResults()); + + if (!EntryUses.empty()) { + Saves[MF.front().getNumber()] |= EntryUses; + for (const MachineBasicBlock &MBB : MF) { + // FIXME: ShrinkWrap2: EHFuncletEntry. + if (MBB.isReturnBlock()) + Restores[MBB.getNumber()] |= EntryUses; + } + } + postProcessResults(OldUses); + + DEBUG(dbgs() << "**** Shrink-wrapping results\n"); + // FIXME: ShrinkWrap2: Check if there are any modifications before printing. + DEBUG(dumpResults()); + +// FIXME: ShrinkWrap2: Remove NDEBUG. +#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) + verifySavesRestores(); +#endif // EXPENSIVE_CHECKS +} Index: lib/CodeGen/TargetPassConfig.cpp =================================================================== --- lib/CodeGen/TargetPassConfig.cpp +++ lib/CodeGen/TargetPassConfig.cpp @@ -39,6 +39,11 @@ using namespace llvm; +// FIXME: ShrinkWrap2: Keep the second one only. Move it from TPC when we +// decided that ShrinkWrapping is no longer a pass. +extern cl::opt EnableShrinkWrap2Opt; +static cl::opt ShrinkWrapPass("shrink-wrap-pass", cl::init(2), cl::Hidden, + cl::desc("Choose shrink-wrap-pass to use")); static cl::opt DisablePostRASched("disable-post-ra", cl::Hidden, cl::desc("Disable Post Regalloc Scheduler")); static cl::opt DisableBranchFold("disable-branch-fold", cl::Hidden, @@ -719,8 +724,10 @@ addPostRegAlloc(); // Insert prolog/epilog code. Eliminate abstract frame index references... - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None && ShrinkWrapPass == 1) { addPass(&ShrinkWrapID); + EnableShrinkWrap2Opt = cl::BOU_FALSE; + } // Prolog/Epilog inserter needs a TargetMachine to instantiate. But only // do so if it hasn't been disabled, substituted, or overridden. Index: lib/Target/AArch64/AArch64FrameLowering.h =================================================================== --- lib/Target/AArch64/AArch64FrameLowering.h +++ lib/Target/AArch64/AArch64FrameLowering.h @@ -36,6 +36,12 @@ void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + // FIXME: ShrinkWrap2: Delay the computation of NumRegsSpilled. + bool + assignCalleeSavedSpillSlots(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector &CSI) const override; + bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, @@ -69,6 +75,16 @@ bool enableStackSlotScavenging(const MachineFunction &MF) const override; + // FIXME: ShrinkWrap2: We need this to call computeCalleeSaveRegisterParis + // before we spill them. + void + processValidCalleeSavedInfo(MachineFunction &MF, + const TargetRegisterInfo *TRI, + std::vector &CSI) const override; + + std::unique_ptr + createCSRShrinkWrapInfo(const MachineFunction &MF) const override; + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; Index: lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64FrameLowering.cpp +++ lib/Target/AArch64/AArch64FrameLowering.cpp @@ -137,6 +137,201 @@ STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); +static bool produceCompactUnwindFrame(const MachineFunction &MF); +static unsigned estimateRSStackSizeLimit(MachineFunction &MF); + +class AArch64CSRShrinkWrapInfo final : public ShrinkWrapInfo { + /// Number of bits the result needs. + unsigned NumCSRs = 0; + +public: + unsigned getNumResultBits() const override { return NumCSRs; } + + AArch64CSRShrinkWrapInfo(const MachineFunction &MF) : ShrinkWrapInfo(MF) { + + // All calls are tail calls in GHC calling conv, and functions have no + // prologue/epilogue. + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) + return; + + const AArch64RegisterInfo *RegInfo = + static_cast( + MF.getSubtarget().getRegisterInfo()); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + // Count the number of CSRs. + for (unsigned i = 0; CSRegs[i]; ++i) + ++NumCSRs; + + determineCSRUses(); + + // FIXME: ShrinkWrap2: This is a duplicate of determineCalleeSaves. We + // should split this into multiple functions, and remove all the side + // effects from here. + auto AFI = + const_cast(MF.getInfo()); + unsigned UnspilledCSGPR = AArch64::NoRegister; + unsigned UnspilledCSGPRIdx = static_cast(-1); + unsigned UnspilledCSGPRPaired = AArch64::NoRegister; + unsigned UnspilledCSGPRPairedIdx = static_cast(-1); + + // FIXME: ShrinkWrap2: This should be available later somehow. + BitVector SavedRegs(getNumResultBits()); + for (BitVector &BV : Uses) + SavedRegs |= BV; + + auto *EntrySaves = &Uses[MF.front().getNumber()]; + if (EntrySaves->empty()) + EntrySaves->resize(getNumResultBits()); + + const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); + // The frame record needs to be created by saving the appropriate registers + if (TFI->hasFP(MF)) { + // The frame pointer needs to be used in the entry and return of a + // function, to prevent optimizations. + EntrySaves->set(AArch64::FP); + SavedRegs.set(AArch64::FP); + for (const MachineBasicBlock &MBB : MF) { + if (MBB.isReturnBlock()) { + BitVector &Use = Uses[MBB.getNumber()]; + if (Use.empty()) + Use.resize(getNumResultBits()); + Use.set(AArch64::FP); + EntrySaves = &Uses[MF.front().getNumber()]; + } + } + // FIXME: ShrinkWrap2: Should we let LR be shrink-wrapped? + // EntrySaves.set(AArch64::LR); + // SavedRegs.set(AArch64::LR); + } + + unsigned BasePointerReg = AArch64::NoRegister; + if (RegInfo->hasBasePointer(MF)) + BasePointerReg = RegInfo->getBaseRegister(); + + unsigned ExtraCSSpill = 0; + // Figure out which callee-saved registers to save/restore. + for (unsigned i = 0; CSRegs[i]; ++i) { + const unsigned Reg = CSRegs[i]; + const unsigned RegIdx = i; + + // Add the base pointer register to SavedRegs if it is callee-save. + if (Reg == BasePointerReg) { + EntrySaves->set(RegIdx); + SavedRegs.set(RegIdx); + // FIXME: ShrinkWrap2: gather the return blocks and re-use them. + for (const MachineBasicBlock &MBB : MF) { + if (MBB.isReturnBlock()) { + BitVector &Use = Uses[MBB.getNumber()]; + if (Use.empty()) + Use.resize(getNumResultBits()); + Use.set(RegIdx); + EntrySaves = &Uses[MF.front().getNumber()]; + } + } + } + + bool RegUsed = SavedRegs.test(RegIdx); + unsigned PairedReg = CSRegs[i ^ 1]; + unsigned PairedRegIdx = i ^ 1; + if (!RegUsed) { + if (AArch64::GPR64RegClass.contains(Reg) && + !RegInfo->isReservedReg(MF, Reg)) { + UnspilledCSGPR = Reg; + UnspilledCSGPRIdx = RegIdx; + UnspilledCSGPRPaired = PairedReg; + UnspilledCSGPRPairedIdx = PairedRegIdx; + } + continue; + } + + // MachO's compact unwind format relies on all registers being stored in + // pairs. + // FIXME: the usual format is actually better if unwinding isn't needed. + // FIXME: ShrinkWrap2: don't check if the paired register is saved if it's + // not a callee save. This can happen if we have an odd number of CSRs + // (like MostRegsCC). + if (produceCompactUnwindFrame(MF) && PairedRegIdx < NumCSRs && + !SavedRegs.test(PairedRegIdx)) { + SavedRegs.set(PairedRegIdx); + if (AArch64::GPR64RegClass.contains(PairedReg) && + !RegInfo->isReservedReg(MF, PairedReg)) + ExtraCSSpill = PairedReg; + } + } + + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + for (int RegIdx + : SavedRegs.set_bits()) dbgs() + << ' ' << PrintReg(CSRegs[RegIdx], RegInfo); + dbgs() << "\n";); + + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumRegsSpilled = SavedRegs.count(); + bool CanEliminateFrame = NumRegsSpilled == 0; + + // The CSR spill slots have not been allocated yet, so estimateStackSize + // won't include them. + MachineFrameInfo &MFI = const_cast(MF.getFrameInfo()); + unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled; + DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); + unsigned EstimatedStackSizeLimit = + estimateRSStackSizeLimit(const_cast(MF)); + bool BigStack = (CFSize > EstimatedStackSizeLimit); + if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) + AFI->setHasStackFrame(true); + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. If we already spilled an extra callee-saved register + // above to keep the number of spills even, we don't need to do anything + // else here. + if (BigStack) { + if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) { + DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) + << " to get a scratch register.\n"); + EntrySaves->set(UnspilledCSGPRIdx); + // FIXME: ShrinkWrap2: Mark it in the return blocks too. + SavedRegs.set(UnspilledCSGPRIdx); + // MachO's compact unwind format relies on all registers being stored in + // pairs, so if we need to spill one extra for BigStack, then we need to + // store the pair. + if (produceCompactUnwindFrame(MF)) { + EntrySaves->set(UnspilledCSGPRPairedIdx); + // FIXME: ShrinkWrap2: Mark it in the return blocks too. + SavedRegs.set(UnspilledCSGPRPairedIdx); + } + ExtraCSSpill = UnspilledCSGPRPaired; + NumRegsSpilled = SavedRegs.count(); + } + + // If we didn't find an extra callee-saved register to spill, create + // an emergency spill slot. + if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) { + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + const TargetRegisterClass &RC = AArch64::GPR64RegClass; + unsigned Size = TRI->getSpillSize(RC); + unsigned Align = TRI->getSpillAlignment(RC); + int FI = MFI.CreateStackObject(Size, Align, false); + // FIXME: ShrinkWrap2: Temporary hack. Remove. + MFI.RS->addScavengingFrameIndex(FI); + DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI + << " as the emergency spill slot.\n"); + } + } + + // Round up to register pair alignment to avoid additional SP adjustment + // instructions. + AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); + } + + raw_ostream &printElt(unsigned Elt, raw_ostream &OS) const override { + auto &TRI = *MF.getSubtarget().getRegisterInfo(); + OS << PrintReg(TRI.getCalleeSavedRegs(&MF)[Elt], &TRI); + return OS; + } +}; + /// Look at each instruction that references stack frames and return the stack /// size limit beyond which some of these instructions will require a scratch /// register during their expansion later. @@ -364,6 +559,7 @@ // Convert callee-save register save/restore instruction to do stack pointer // decrement/increment to allocate/deallocate the callee-save stack area by // converting store/load to use pre/post increment version. +LLVM_ATTRIBUTE_USED // FIXME: ShrinkWrap2: Remove attribute when we reuse this. static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) { @@ -434,6 +630,8 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, unsigned LocalStackSize) { unsigned Opc = MI.getOpcode(); + if (Opc == TargetOpcode::CFI_INSTRUCTION) + return; (void)Opc; assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi || Opc == AArch64::STRXui || Opc == AArch64::STRDui || @@ -474,6 +672,7 @@ return; int NumBytes = (int)MFI.getStackSize(); + // FIXME: ShrinkWrap2: This is set by determineCalleeSaves. Seems wrong to me. if (!AFI->hasStackFrame()) { assert(!HasFP && "unexpected function without stack frame but with FP"); @@ -502,6 +701,7 @@ return; } + // FIXME: ShrinkWrap2: This is set by determineCalleeSaves. Seems wrong to me. auto CSStackSize = AFI->getCalleeSavedStackSize(); // All of the remaining stack allocations are for locals. AFI->setLocalStackSize(NumBytes - CSStackSize); @@ -512,8 +712,16 @@ MachineInstr::FrameSetup); NumBytes = 0; } else if (CSStackSize != 0) { + // FIXME: ShrinkWrap2: For now, we can't use push / pop for save / restore + // of CSR. + if (MFI.getShouldUseShrinkWrap2()) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -CSStackSize, + TII, MachineInstr::FrameSetup); + } + else { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, -CSStackSize); + } NumBytes -= CSStackSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); @@ -527,6 +735,13 @@ fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize()); ++MBBI; } + if (CombineSPBump && + (MFI.getShouldUseShrinkWrap2() || MFI.getShouldUseStackShrinkWrap2())) { + for (MachineOperand *MO : AFI->getCSROffsetsToFix()) { + MachineInstr &MI = *MO->getParent(); + fixupCalleeSaveRestoreStackOffset(MI, AFI->getLocalStackSize()); + } + } if (HasFP) { // Only set up FP if we actually need to. Frame pointer is fp = sp - 16. int FPOffset = CSStackSize - 16; @@ -552,13 +767,30 @@ } // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) + if (!canUseRedZone(MF)) { // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); + // FIXME: ShrinkWrap2: If we have another stack allocation here, and we're + // using SP for all the non-entry/non-return blocks, we have to fixup our + // offsets emitted for the callee saved regs. The ideal would be to know + // if we have this extra local stack allocation when computing the + // offsets, but that information is not available yet at that point. + + // Another solution would be to actually use the FI operands as all the + // targets do, and let resolveFrameIndex do the job. + for (MachineOperand *MO : AFI->getCSROffsetsToFix()) + MO->setImm(MO->getImm() + NumBytes / 8); // This is SP-relative, it only + // occurs when we don't have a + // stack frame. Which means + // that the offset is unsigned + // and scaled, so we need to + // divide by 8. + } + if (NeedsRealignment) { const unsigned Alignment = MFI.getMaxAlignment(); const unsigned NrBitsToZero = countTrailingZeros(Alignment); @@ -682,6 +914,10 @@ .setMIFlags(MachineInstr::FrameSetup); } + + // FIXME: ShrinkWrap2: We emit CFI when we emit the instructions. + if (MFI.getShouldUseShrinkWrap2()) + return; // Now emit the moves for whatever callee saved regs we have (including FP, // LR if those are saved). emitCalleeSavedFrameMoves(MBB, MBBI); @@ -758,9 +994,13 @@ auto CSStackSize = AFI->getCalleeSavedStackSize(); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + // FIXME: ShrinkWrap2: For now, we can't use push / pop for save / restore + // of CSR. + if (!MFI.getShouldUseShrinkWrap2()) { if (!CombineSPBump && CSStackSize != 0) convertCalleeSaveRestoreToSPPrePostIncDec( MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize); + } // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); @@ -782,6 +1022,12 @@ return; } + // FIXME: ShrinkWrap2: For now, we can't use push / pop for save / restore + // of CSR, so we have to restore SP manually. + if (MFI.getShouldUseShrinkWrap2()) { + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + CSStackSize, TII, MachineInstr::FrameDestroy); + } NumBytes -= CSStackSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); @@ -908,7 +1154,11 @@ return getKillRegState(!IsLiveIn); } -static bool produceCompactUnwindFrame(MachineFunction &MF) { +static bool produceCompactUnwindFrame(const MachineFunction &MF) { + // FIXME: ShrinkWrap2: Fix compact unwinding. + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.getShouldUseShrinkWrap2()) + return false; const AArch64Subtarget &Subtarget = MF.getSubtarget(); AttributeList Attrs = MF.getFunction()->getAttributes(); return Subtarget.isTargetMachO() && @@ -916,22 +1166,6 @@ Attrs.hasAttrSomewhere(Attribute::SwiftError)); } -namespace { - -struct RegPairInfo { - unsigned Reg1 = AArch64::NoRegister; - unsigned Reg2 = AArch64::NoRegister; - int FrameIdx; - int Offset; - bool IsGPR; - - RegPairInfo() = default; - - bool isPaired() const { return Reg2 != AArch64::NoRegister; } -}; - -} // end anonymous namespace - static void computeCalleeSaveRegisterPairs( MachineFunction &MF, const std::vector &CSI, const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs) { @@ -946,10 +1180,13 @@ (void)CC; // MachO's compact unwind format relies on all registers being stored in // pairs. + // FIXME: ShrinkWrap2: Fix compact unwind format. + if (!MFI.getShouldUseShrinkWrap2()) { assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); + } unsigned Offset = AFI->getCalleeSavedStackSize(); for (unsigned i = 0; i < Count; ++i) { @@ -961,12 +1198,16 @@ RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1); // Add the next reg to the pair if it is in the same register class. + // FIXME: ShrinkWrap2: Creating real pairs during shrink-wrapping may have + // double save / restores, that can corrupt registers. + if (!MFI.getShouldUseShrinkWrap2()) { if (i + 1 < Count) { unsigned NextReg = CSI[i + 1].getReg(); if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) || (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg))) RPI.Reg2 = NextReg; } + } // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store @@ -974,22 +1215,29 @@ // // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. + // FIXME: ShrinkWrap2: Make it work with shrink-wrapping. + if (!MFI.getShouldUseShrinkWrap2()) { assert((!RPI.isPaired() || (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + } // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. + // FIXME: ShrinkWrap2: Fix compact unwind format. + if (!MFI.getShouldUseShrinkWrap2()) { assert((!produceCompactUnwindFrame(MF) || CC == CallingConv::PreserveMost || (RPI.isPaired() && ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || RPI.Reg1 + 1 == RPI.Reg2))) && "Callee-save registers not saved as adjacent register pair!"); + } RPI.FrameIdx = CSI[i].getFrameIdx(); - if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { + // FIXME: ShrinkWrap2: We are never using pairs. + if (!MFI.getShouldUseShrinkWrap2() && Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { // Round up size of non-pair to pair size if we need to pad the // callee-save area to ensure 16-byte alignment. Offset -= 16; @@ -1000,6 +1248,13 @@ Offset -= RPI.isPaired() ? 16 : 8; assert(Offset % 8 == 0); RPI.Offset = Offset / 8; + + // FIXME: ShrinkWrap2: This is unused through the whole backend. Instead, we + // have the RegisterPairInfo. + MFI.setObjectSize(RPI.FrameIdx, 8); + MFI.setObjectOffset(RPI.FrameIdx, RPI.Offset); + + // FIXME: ShrinkWrap2: Check for out of bounds ofsets for STR/STUR/etc? assert((RPI.Offset >= -64 && RPI.Offset <= 63) && "Offset out of bounds for LDP/STP immediate"); @@ -1009,17 +1264,36 @@ } } +// FIXME: ShrinkWrap2: We need this here because we have to call +// computeCalleeSaveRegisterPairs once after frame indices have been assigned. +void AArch64FrameLowering::processValidCalleeSavedInfo( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + AArch64FunctionInfo *AFI = MF.getInfo(); + computeCalleeSaveRegisterPairs(MF, CSI, TRI, AFI->getRegPairs()); +} + bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + AArch64FunctionInfo *AFI = MF.getInfo(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - DebugLoc DL; - SmallVector RegPairs; - - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); const MachineRegisterInfo &MRI = MF.getRegInfo(); + DebugLoc DL; + SmallVectorImpl &RegPairs = AFI->getRegPairs(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + bool needsCFI = + MMI.hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry(); + // FIXME: ShrinkWrap2: We should always use AFI->getRegPairs(), or at least + // avoid calling computeCalleeSaveRegisterPair more than once. + if (!MFI.getShouldUseShrinkWrap2()) { + RegPairs.clear(); + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + } for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { @@ -1028,6 +1302,15 @@ unsigned Reg2 = RPI.Reg2; unsigned StrOpc; + // FIXME: ShrinkWrap2: Skip all the registers that are not related to this + // block. + if (MFI.getShouldUseShrinkWrap2()) { + if (find_if(CSI, [&](const CalleeSavedInfo &CS) { + return CS.getReg() == Reg1; + }) == CSI.end()) + continue; + } + // Issue sequence of spills for cs regs. The first spill may be converted // to a pre-decrement store later by emitPrologue if the callee-save stack // area allocation can't be combined with the local stack area allocation. @@ -1050,6 +1333,27 @@ dbgs() << ", " << RPI.FrameIdx+1; dbgs() << ")\n"); + // FIXME: ShrinkWrap2: We need to decide wether to use SP or FP-relative + // store / load here. In order to do that, we have several factors: + // * If we don't use shrink-wrapping, always use SP. + // * If we don't have a frame, always use SP. + // * If it's the entry block, do not use SP, because we might have SP + // adjustments before / after. + // * If we don't have a frame, and we have local variables, and we *have* to + // use SP, then we have to keep track of the offsets that are used to store + // / load the CSR, and update them during prologue emission, where we have + // the information about the local stack size. + bool isEntryBlock = &MF.front() == &MBB; + bool ShouldUseSP = !hasFP(*MBB.getParent()) || isEntryBlock; + int CSStackSize = AFI->getCalleeSavedStackSize(); + int Imm = -(CSStackSize - 16 - int(RPI.Offset) * 8) / 8; + if (MFI.getShouldUseShrinkWrap2() && !ShouldUseSP) { + if (StrOpc == AArch64::STRXui) + StrOpc = AArch64::STURXi; + else if (StrOpc == AArch64::STRDui) + StrOpc = AArch64::STURDi; + Imm *= 8; + } MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); if (!MRI.isReserved(Reg1)) MBB.addLiveIn(Reg1); @@ -1061,14 +1365,42 @@ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), MachineMemOperand::MOStore, 8, 8)); } - MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) - .addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit - .setMIFlag(MachineInstr::FrameSetup); + if (MFI.getShouldUseShrinkWrap2()) { + if (ShouldUseSP) { + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) + .addReg(AArch64::SP) + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit + .setMIFlag(MachineInstr::FrameSetup); + MachineInstr *MI = MIB; + if (&MBB != &MF.front()) + AFI->getCSROffsetsToFix().push_back(&MI->getOperand(2)); + } else { + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) + .addReg(AArch64::FP) + .addImm(Imm) // [sp, #offset*8], where factor*8 is implicit + .setMIFlag(MachineInstr::FrameSetup); + } + } else { + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) + .addReg(AArch64::SP) + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit + .setMIFlag(MachineInstr::FrameSetup); + } MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), MachineMemOperand::MOStore, 8, 8)); + if (MFI.getShouldUseShrinkWrap2() && needsCFI) { + int64_t Offset = ((-(CSStackSize - 16 - int(RPI.Offset) * 8) / 8) - 2) * 8; + const MCRegisterInfo *MCRI = STI.getRegisterInfo(); + unsigned DwarfReg = MCRI->getDwarfRegNum(Reg1, true); + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); + BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); } + } + return true; } @@ -1077,14 +1409,26 @@ const std::vector &CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + AArch64FunctionInfo *AFI = MF.getInfo(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; - SmallVector RegPairs; + SmallVectorImpl &RegPairs = AFI->getRegPairs(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const MCRegisterInfo *MRI = STI.getRegisterInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + bool needsCFI = + MMI.hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry(); if (MI != MBB.end()) DL = MI->getDebugLoc(); - computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + // FIXME: ShrinkWrap2: We should always use AFI->getRegPairs(), or at least + // avoid calling computeCalleeSaveRegisterPair more than once. + if (!MFI.getShouldUseShrinkWrap2()) { + RegPairs.clear(); + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + } for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE; ++RPII) { @@ -1092,6 +1436,15 @@ unsigned Reg1 = RPI.Reg1; unsigned Reg2 = RPI.Reg2; + // FIXME: ShrinkWrap2: Skip all the registers that are not related to this + // block. + if (MFI.getShouldUseShrinkWrap2()) { + if (find_if(CSI, [&](const CalleeSavedInfo &CS) { + return CS.getReg() == Reg1; + }) == CSI.end()) + continue; + } + // Issue sequence of restores for cs regs. The last restore may be converted // to a post-increment load later by emitEpilogue if the callee-save stack // area allocation can't be combined with the local stack area allocation. @@ -1113,6 +1466,19 @@ dbgs() << ", " << RPI.FrameIdx+1; dbgs() << ")\n"); + // FIXME: ShrinkWrap2: See comment in spillCalleeSavedRegisters. + bool isReturnBlock = MBB.isReturnBlock(); + bool ShouldUseSP = !hasFP(*MBB.getParent()) || isReturnBlock; + int CSStackSize = AFI->getCalleeSavedStackSize(); + int Imm = -(CSStackSize - 16 - int(RPI.Offset) * 8) / 8; + if (MFI.getShouldUseShrinkWrap2() && !ShouldUseSP) { + if (LdrOpc == AArch64::LDRXui) + LdrOpc = AArch64::LDURXi; + else if (LdrOpc == AArch64::LDRDui) + LdrOpc = AArch64::LDURDi; + Imm *= 8; + } + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); if (RPI.isPaired()) { MIB.addReg(Reg2, getDefRegState(true)); @@ -1120,13 +1486,41 @@ MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), MachineMemOperand::MOLoad, 8, 8)); } - MIB.addReg(Reg1, getDefRegState(true)) - .addReg(AArch64::SP) - .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit - .setMIFlag(MachineInstr::FrameDestroy); + if (MFI.getShouldUseShrinkWrap2()) { + if (ShouldUseSP) { + MIB.addReg(Reg1, getDefRegState(true)) + .addReg(AArch64::SP) + .addImm( + RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + MachineInstr *MI = MIB; + if (!MBB.isReturnBlock()) + AFI->getCSROffsetsToFix().push_back(&MI->getOperand(2)); + } else { + MIB.addReg(Reg1, getDefRegState(true)) + .addReg(AArch64::FP) + .addImm(Imm) // [sp, #offset*8], where factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + } + } else { + MIB.addReg(Reg1, getDefRegState(true)) + .addReg(AArch64::SP) + .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + } + MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), MachineMemOperand::MOLoad, 8, 8)); + + if (MFI.getShouldUseShrinkWrap2() && needsCFI) { + unsigned DwarfReg = MRI->getDwarfRegNum(Reg1, true); + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfReg)); + BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameDestroy); + } } return true; } @@ -1243,7 +1637,17 @@ // Round up to register pair alignment to avoid additional SP adjustment // instructions. +} + +bool AArch64FrameLowering::assignCalleeSavedSpillSlots( + MachineFunction &MF, const TargetRegisterInfo *TRI, + std::vector &CSI) const { + // FIXME: ShrinkWrap2: This is only a hack to delay the computation of + // NumRegsSpilled. + AArch64FunctionInfo *AFI = MF.getInfo(); + unsigned NumRegsSpilled = CSI.size(); AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); + return false; } bool AArch64FrameLowering::enableStackSlotScavenging( @@ -1251,3 +1655,8 @@ const AArch64FunctionInfo *AFI = MF.getInfo(); return AFI->hasCalleeSaveStackFreeSpace(); } + +std::unique_ptr +AArch64FrameLowering::createCSRShrinkWrapInfo(const MachineFunction &MF) const { + return llvm::make_unique(MF); +} Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp =================================================================== --- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -637,6 +637,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, const LdStPairFlags &Flags) { + // FIXME: ShrinkWrap2: Add optimization remarks to see when we miss forming a + // pair. MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need Index: lib/Target/AArch64/AArch64MachineFunctionInfo.h =================================================================== --- lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -23,6 +23,21 @@ namespace llvm { +class MachineOperand; + +/// This contains register pairs computed for callee-save saves / restores. +struct RegPairInfo { + unsigned Reg1 = AArch64::NoRegister; + unsigned Reg2 = AArch64::NoRegister; + int FrameIdx; + int Offset; + bool IsGPR; + + RegPairInfo() = default; + + bool isPaired() const { return Reg2 != llvm::AArch64::NoRegister; } +}; + /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { @@ -44,6 +59,7 @@ /// HasStackFrame - True if this function has a stack frame. Set by /// determineCalleeSaves(). + // FIXME: ShrinkWrap2: This should not be set in determineCalleeSaves... bool HasStackFrame = false; /// \brief Amount of stack frame size, not including callee-saved registers. @@ -88,6 +104,17 @@ /// other stack allocations. bool CalleeSaveStackHasFreeSpace = false; + // FIXME: ShrinkWrap2: This should be replaced with MFI.Objects. + /// Register pairs computed for CSR save / restore. + SmallVector RegPairs; + + // FIXME: ShrinkWrap2: The offsets that probably need to be fixed are + // collected during spillCalleeSavedRegisters but need to be fixed during + // emitPrologue. + /// Machine operands representing SP-related offsets to CSRs, that need to be + /// fixed if local stack allocation happens afterwards. + SmallVector CSROffsetsToFix; + public: AArch64FunctionInfo() = default; @@ -116,6 +143,11 @@ CalleeSaveStackHasFreeSpace = s; } + SmallVectorImpl &getRegPairs() { return RegPairs; } + SmallVectorImpl &getCSROffsetsToFix() { + return CSROffsetsToFix; + } + bool isSplitCSR() const { return IsSplitCSR; } void setIsSplitCSR(bool s) { IsSplitCSR = s; } Index: lib/Target/X86/X86FrameLowering.h =================================================================== --- lib/Target/X86/X86FrameLowering.h +++ lib/Target/X86/X86FrameLowering.h @@ -177,6 +177,9 @@ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool RestoreSP = false) const; + std::unique_ptr + createCSRShrinkWrapInfo(const MachineFunction &MF) const override; + private: uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; Index: lib/Target/X86/X86FrameLowering.cpp =================================================================== --- lib/Target/X86/X86FrameLowering.cpp +++ lib/Target/X86/X86FrameLowering.cpp @@ -35,6 +35,92 @@ using namespace llvm; +class X86CSRShrinkWrapInfo final : public ShrinkWrapInfo { + /// Number of bits the result needs. + unsigned NumCSRs = 0; +public: + unsigned getNumResultBits() const override { return NumCSRs; } + + X86CSRShrinkWrapInfo(const MachineFunction &MF) : ShrinkWrapInfo(MF) { + bool Is64Bit = MF.getSubtarget().is64Bit(); + auto TRI = static_cast( + MF.getSubtarget().getRegisterInfo()); + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + unsigned BasePtrIndex = static_cast(-1); + unsigned RBPIndex = static_cast(-1); + // Count the number of CSRs. + unsigned BasePtr = TRI->getBaseRegister(); + if (Is64Bit && BasePtr == X86::EBX) + BasePtr = X86::RBX; + unsigned FramePtr = TRI->getFramePtr(); + if (Is64Bit && FramePtr == X86::EBP) + FramePtr = X86::RBP; + // FIXME: ShrinkWrap2: Fix HHVM, which has only R12 as a CSR. + for (unsigned i = 0; CSRegs[i]; ++i) { + if (CSRegs[i] == FramePtr) + RBPIndex = i; + else if (CSRegs[i] == BasePtr) + BasePtrIndex = i; + ++NumCSRs; + } + + determineCSRUses(); + + // FIXME: ShrinkWrap2: const_cast + MachineFrameInfo &MFI = const_cast(MF.getFrameInfo()); + + // FIXME: ShrinkWrap2: This is a copy of the code in determineCalleeSaves. + // It also feels like there should not be any side effects done here. + // FIXME: ShrinkWrap2: const_cast + auto X86FI = const_cast( + MF.getInfo()); + int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); + auto SlotSize = TRI->getSlotSize(); + + if (TailCallReturnAddrDelta < 0) { + // create RETURNADDR area + // arg + // arg + // RETADDR + // { ... + // RETADDR area + // ... + // } + // [EBP] + MFI.CreateFixedObject(-TailCallReturnAddrDelta, + TailCallReturnAddrDelta - SlotSize, true); + } + + // Spill the BasePtr if it's used. + if (TRI->hasBasePointer(MF)) { + auto &SavedRegs = Uses[MF.front().getNumber()]; + if (SavedRegs.empty()) + SavedRegs.resize(getNumResultBits()); + SavedRegs.set(BasePtrIndex); + + // Allocate a spill slot for EBP if we have a base pointer and EH + // funclets. + if (MF.hasEHFunclets()) { + int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setHasSEHFramePtrSave(true); + X86FI->setSEHFramePtrSaveIndex(FI); + } + } + + // X86FrameLowering::EmitPrologue spills RBP manually. Remove it from the + // uses. + for (BitVector &BV : Uses) + if (!BV.empty()) + BV.reset(RBPIndex); + } + + raw_ostream &printElt(unsigned Elt, raw_ostream &OS) const override { + auto &TRI = *MF.getSubtarget().getRegisterInfo(); + OS << PrintReg(TRI.getCalleeSavedRegs(&MF)[Elt], &TRI); + return OS; + } +}; + X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride) : TargetFrameLowering(StackGrowsDown, StackAlignOverride, @@ -1070,7 +1156,12 @@ if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; - NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); + NumBytes = FrameSize; + // FIXME: ShrinkWrap2: Since we disabled the push / pop spilling, we now + // have to include the callee saves in our frame size, so that our sp + // displacement can be updated properly. + if (!MFI.getShouldUseShrinkWrap2()) + NumBytes -= X86FI->getCalleeSavedFrameSize(); // Callee-saved registers are pushed on stack before the stack is realigned. if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) @@ -1128,7 +1219,12 @@ } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); - NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); + NumBytes = StackSize; + // FIXME: ShrinkWrap2: Since we disabled the push / pop spilling, we now + // have to include the callee saves in our frame size, so that our sp + // displacement can be updated properly. + if (!MFI.getShouldUseShrinkWrap2()) + NumBytes -= X86FI->getCalleeSavedFrameSize(); } // For EH funclets, only allocate enough space for outgoing calls. Save the @@ -1141,6 +1237,10 @@ bool PushedRegs = false; int StackOffset = 2 * stackGrowth; + // FIXME: Add CFI for all the callee saved registers. Since the saves / + // restores are not at the beginning of the function, we need to go through + // all the basic blocks. + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || @@ -1572,7 +1672,12 @@ } else if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; - NumBytes = FrameSize - CSSize; + NumBytes = FrameSize; + // FIXME: ShrinkWrap2: Since we disabled the push / pop spilling, we now + // have to include the callee saves in our frame size, so that our sp + // displacement can be updated properly. + if (!MFI.getShouldUseShrinkWrap2()) + NumBytes -= CSSize; // Callee-saved registers were pushed on stack before the stack was // realigned. @@ -1584,7 +1689,13 @@ TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) .setMIFlag(MachineInstr::FrameDestroy); } else { - NumBytes = StackSize - CSSize; + NumBytes = StackSize; + // FIXME: ShrinkWrap2: Since we disabled the push / pop spilling, we now + // have to include the callee saves in our frame size, so that our sp + // displacement can be updated properly. + if (!MFI.getShouldUseShrinkWrap2()) + NumBytes -= CSSize; + } uint64_t SEHStackAllocAmt = NumBytes; @@ -1645,6 +1756,12 @@ unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); uint64_t LEAAmount = IsWin64Prologue ? SEHStackAllocAmt - SEHFrameOffset : -CSSize; + // FIXME: ShrinkWrap2: Here, we can't assume we are going to pop all the + // callee saves (because we aren't, we actually move them back, then adjust + // the stack), so we just want to restore the stack pointer. This should go + // away at some point... + if (MFI.getShouldUseShrinkWrap2()) + LEAAmount = 0; // There are only two legal forms of epilogue: // - add SEHAllocationSize, %rsp @@ -1937,6 +2054,11 @@ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { + // FIXME: ShrinkWrap2: Save using this function when it's adapted to work + // without push / pop. + if (MBB.getParent()->getFrameInfo().getShouldUseShrinkWrap2()) + return false; + DebugLoc DL = MBB.findDebugLoc(MI); // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI @@ -2003,6 +2125,11 @@ MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { + // FIXME: ShrinkWrap2: Restore using this function when it's adapted to work + // without push / pop. + if (MBB.getParent()->getFrameInfo().getShouldUseShrinkWrap2()) + return false; + if (CSI.empty()) return false; @@ -3039,3 +3166,8 @@ UnwindHelpFI) .addImm(-2); } + +std::unique_ptr +X86FrameLowering::createCSRShrinkWrapInfo(const MachineFunction &MF) const { + return llvm::make_unique(MF); +} Index: test/CodeGen/AArch64/ShrinkWrapping/AliasInRegMask.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/AliasInRegMask.mir @@ -0,0 +1,26 @@ +# RUN: llc -mtriple=aarch64-- -run-pass prologepilog -debug-only=shrink-wrap2 %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +--- | + declare void @f0() nounwind + define void @f1() nounwind { ret void } +... +--- +name: f1 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2, %bb.1 + + CBNZW %wzr, %bb.2 + B %bb.1 + + bb.1: + TCRETURNdi @f0, 0, csr_aarch64_aapcs, implicit %sp + + bb.2: + RET_ReallyLR +... +# Check that we don't look for aliased regs in RegMasks. + +# CHECK-LABEL: f1 +# CHECK-NOT: Uses: Index: test/CodeGen/AArch64/ShrinkWrapping/CFIStackFrame.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/CFIStackFrame.mir @@ -0,0 +1,28 @@ +# RUN: llc -filetype obj -mtriple=arm64-apple-ios10.3.0 -run-pass=prologepilog -debug-only=shrink-wrap2 %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + declare void @f0() nounwind + define void @f1() nounwind { ret void } +... +--- +name: f1 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + + CBNZW %wzr, %bb.2 + B %bb.1 + + bb.1: + successors: %bb.2 + + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + BL @f0, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + + bb.2: + RET_ReallyLR +... +# CHECK-LABEL: f1 +# CHECK-NOT: Insufficient CFI instructions to define a frame! Index: test/CodeGen/AArch64/ShrinkWrapping/CSRUsedOnTerminator.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/CSRUsedOnTerminator.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=aarch64-- -run-pass prologepilog -debug-only=shrink-wrap2 %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + + %nzcv = IMPLICIT_DEF + Bcc 0, %bb.1, implicit killed %nzcv + B %bb.2 + + bb.1: + RET_ReallyLR + + bb.2: + successors: %bb.3, %bb.4 + + %x21 = IMPLICIT_DEF + + %nzcv = IMPLICIT_DEF + Bcc 0, %bb.3, implicit killed %nzcv + B %bb.4 + + bb.3: + RET_ReallyLR + + bb.4: + liveins: %x21 + successors: %bb.5, %bb.6 + + CBZX killed %x21, %bb.5 + B %bb.6 + + bb.5: + RET_ReallyLR + + bb.6: + RET_ReallyLR +... +# Check that we mark uses in terminator instructions as used in all the successors as well. + +# CHECK-LABEL: f0 + +# CHECK: BB#2 uses : %X21 +# CHECK-NEXT: BB#4 uses : %X21 +# CHECK-NEXT: BB#5 uses : %X21 +# CHECK-NEXT: BB#6 uses : %X21 Index: test/CodeGen/AArch64/ShrinkWrapping/CompactUnwindingFPSPPair.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/CompactUnwindingFPSPPair.mir @@ -0,0 +1,44 @@ +# RUN: llc -filetype obj -mtriple=arm64-apple-ios10.3.0 -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o - 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +liveins: + - { reg: '%x1' } +body: | + bb.0: + successors: %bb.1, %bb.3 + liveins: %x1 + + %x19 = COPY %x1 + CBNZW %wzr, %bb.3 + B %bb.1 + + bb.1: + successors: %bb.2 + liveins: %x19 + + + bb.2: + successors: %bb.2, %bb.3 + liveins: %x19 + + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + %x1 = COPY %x19 + BL @f0, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit undef %x0, implicit %x1, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + dead %xzr = SUBSXri undef %x8, 8, 0, implicit-def %nzcv + Bcc 12, %bb.2, implicit killed %nzcv + B %bb.3 + + bb.3: + RET_ReallyLR + +... +# Check that we're not trying to produce compact unwinding when FP and LR are split. + +# CHECK-LABEL: f0 +# CHECK-NOT: Pushing invalid registers for frame! Index: test/CodeGen/AArch64/ShrinkWrapping/DetermineCalleeSavesSideEffects.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/DetermineCalleeSavesSideEffects.mir @@ -0,0 +1,37 @@ +# RUN: llc -march=aarch64 -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false -run-pass=prologepilog -debug-only=shrink-wrap2 %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +--- | + declare void @f1() #0 + define void @f0() #1 { ret void } + + attributes #0 = { nounwind "target-cpu"="cortex-a57" } + attributes #1 = { nounwind "no-frame-pointer-elim-non-leaf" "target-cpu"="cortex-a57" } + +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + + CBNZW %wzr, %bb.2 + B %bb.1 + + bb.1: + successors: %bb.2 + + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + BL @f1, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + + bb.2: + RET_ReallyLR +... +# Check that while we look for CSRs, we set the appropriate internal state of AArch64MachineFunction. + +# CHECK-LABEL: f0 +# CHECK-NOT: unexpected function without stacke frame but with FP +# CHECK: BB#1 uses : %LR +# CHECK: **** Shrink-wrapping results +# CHECK-NEXT: BB#1: Saves: %LR, | Restores: %LR, Index: test/CodeGen/AArch64/ShrinkWrapping/FirstMBBNum2.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/FirstMBBNum2.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=aarch64-- -O0 -global-isel -global-isel-abort=0 -verify-machineinstrs -enable-shrink-wrap2=true -debug-only=shrink-wrap2 %s -o - 2>&1 | FileCheck %s +; FIXME: ShrinkWrap2: use MIR once we fix stack protector assert. +; REQUIRES: asserts +; This test causes the first MBB ID to be 2, which provoked a bug. + +; CHECK-LABEL: ABIi128 + +; CHECK: BB#2 uses : %LR +; CHECK: **** Shrink-wrapping results +; CHECK-NEXT: BB#2: Saves: %LR, | Restores: %LR, + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--" + +define i128 @ABIi128(i128 %arg1) nounwind { + %res = fptoui fp128 undef to i128 + ret i128 %res +} Index: test/CodeGen/AArch64/ShrinkWrapping/NoPostPreLoadStore.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/NoPostPreLoadStore.mir @@ -0,0 +1,38 @@ +# RUN: llc -mtriple=arm64-apple-ios -debug-only=shrink-wrap2 -run-pass=prologepilog %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts + +--- | + define void @f0() nounwind { ret void } + declare void @f1() nounwind + declare void @f2() nounwind +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + + CBNZW %wzr, %bb.2 + B %bb.1 + + bb.1: + successors: %bb.2 + + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + BL @f1, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + + bb.2: + TCRETURNdi @f2, 0, csr_aarch64_aapcs, implicit %sp + +... + +# This test makes sure that we don't convert callee save save / restores from +# store / load to pre / post increment load store. + +# CHECK-LABEL: f0 +# CHECK-NOT: This is not a register operand +# CHECK: BB#1 uses : %LR +# CHECK: **** Shrink-wrapping results +# CHECK-NEXT: BB#1: Saves: %LR, | Restores: %LR, Index: test/CodeGen/AArch64/ShrinkWrapping/NoStackObjects.mir =================================================================== --- /dev/null +++ test/CodeGen/AArch64/ShrinkWrapping/NoStackObjects.mir @@ -0,0 +1,53 @@ +# RUN: llc -filetype obj -mtriple=arm64-apple-ios10.3.0 -run-pass=prologepilog -debug-only=shrink-wrap2 %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { entry: ret void } + declare void @f1() +... +--- +name: f0 +tracksRegLiveness: true +liveins: + - { reg: '%d0' } + - { reg: '%d1' } +body: | + bb.0: + successors: %bb.2, %bb.1 + liveins: %d0, %d1 + + dead %wzr = SUBSWri undef %w8, 0, 0, implicit-def %nzcv + Bcc 12, %bb.2, implicit killed %nzcv + B %bb.1 + + bb.1: + successors: %bb.4, %bb.3 + liveins: %d0, %d1 + + CBNZW %wzr, %bb.4 + B %bb.3 + + bb.2: + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + %x3 = COPY %sp + BL @f1, csr_aarch64_aapcs_thisreturn, implicit-def dead %lr, implicit %sp, implicit undef %x0, implicit undef %x1, implicit undef %x2, implicit killed %x3, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + + bb.3: + successors: %bb.4 + liveins: %d0, %d1 + + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp + %x3 = COPY %sp + %w4 = MOVi32imm 70 + %w5 = COPY %wzr + BL @f1, csr_aarch64_aapcs_thisreturn, implicit-def dead %lr, implicit %sp, implicit undef %x0, implicit %d0, implicit %d1, implicit undef %x1, implicit undef %x2, implicit killed %x3, implicit undef %d2, implicit killed %w4, implicit killed %w5, implicit-def %sp + ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp + + bb.4: + %w0 = MOVi32imm 1 + RET_ReallyLR implicit killed %w0 +... +# Check that we don't use the stack objects in the AArch64 backend. + +# CHECK-LABEL: f0 +# CHECK-NOT: Getting frame offset for a dead object? Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll =================================================================== --- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -1,5 +1,6 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-post-ra < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios -disable-fp-elim -disable-post-ra < %s | FileCheck %s --check-prefix=CHECK-MACHO +; XFAIL: * ; This test aims to check basic correctness of frame layout & ; frame access code. There are 8 functions in this test file, @@ -660,6 +661,7 @@ ret void } +; FIXME: ShrinkWrap2: This fails because we don't combine the two sp displacements. ; CHECK-LABEL: realign_conditional ; No realignment in the prologue. ; CHECK-NOT: and Index: test/CodeGen/AArch64/alloca.ll =================================================================== --- test/CodeGen/AArch64/alloca.ll +++ test/CodeGen/AArch64/alloca.ll @@ -1,6 +1,9 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s ; RUN: llc -mtriple=arm64-apple-ios -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK-MACHO ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s +; XFAIL: * +; FIXME: ShrinkWrap2: This fails with shrink-wrapping enabled because we don't +; care about compact unwinding, and we don't force x20 to be spilled anyway. declare void @use_addr(i8*) Index: test/CodeGen/AArch64/arm64-aapcs-be.ll =================================================================== --- test/CodeGen/AArch64/arm64-aapcs-be.ll +++ test/CodeGen/AArch64/arm64-aapcs-be.ll @@ -1,5 +1,6 @@ ; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=false < %s | FileCheck %s ; RUN: llc -mtriple=aarch64_be-none-eabi -fast-isel=true < %s | FileCheck %s +; XFAIL: This test fails with shrink-wrapping enabled, because with pairs enabled, if we have only one register, it will have a 16B alignment, so we use 32B for the stack instead of just 8B. See computeCalleeSaveRegisterPairs. ; Check narrow argument passing via stack - callee end define i32 @test_narrow_args_callee(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, i8 %c, i16 %s) #0 { Index: test/CodeGen/AArch64/arm64-abi_align.ll =================================================================== --- test/CodeGen/AArch64/arm64-abi_align.ll +++ test/CodeGen/AArch64/arm64-abi_align.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -mtriple=arm64-apple-darwin -mcpu=cyclone -enable-misched=false -disable-fp-elim | FileCheck %s ; RUN: llc < %s -mtriple=arm64-apple-darwin -O0 -disable-fp-elim | FileCheck -check-prefix=FAST %s +; XFAIL: * +; FIXME: ShrinkWrap2: This test fails with shrink-wrapping enabled because we +; don't combine SP updates. ; rdar://12648441 ; Generated from arm64-arguments.c with -O2. Index: test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll =================================================================== --- test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll +++ test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -1,4 +1,7 @@ ; RUN: llc -mtriple=arm64-eabi -mcpu=cyclone < %s | FileCheck %s +; XFAIL: * +; FIXME: ShrinkWrap2: This test fails with shrink-wrapping enabled because we +; don't save LR, since there are no calls. ; CHECK: foo ; CHECK: str w[[REG0:[0-9]+]], [x19, #264] Index: test/CodeGen/AArch64/arm64-dead-register-def-bug.ll =================================================================== --- test/CodeGen/AArch64/arm64-dead-register-def-bug.ll +++ test/CodeGen/AArch64/arm64-dead-register-def-bug.ll @@ -1,3 +1,4 @@ +; FIXME: ShrinkWrap2: .ll -> .mir when stack protector stuff is fixed. ; RUN: llc -mtriple="arm64-apple-ios" < %s | FileCheck %s ; ; Check that the dead register definition pass is considering implicit defs. Index: test/CodeGen/AArch64/arm64-fp128.ll =================================================================== --- test/CodeGen/AArch64/arm64-fp128.ll +++ test/CodeGen/AArch64/arm64-fp128.ll @@ -1,4 +1,7 @@ ; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone -aarch64-enable-atomic-cfg-tidy=0 < %s | FileCheck %s +; XFAIL: * +; FIXME: ShrinkWrap2: This test fails with shrink-wrapping enabled because we +; insert a restore point between a cmp and a jump. @lhs = global fp128 zeroinitializer, align 16 @rhs = global fp128 zeroinitializer, align 16 Index: test/CodeGen/AArch64/arm64-hello.ll =================================================================== --- test/CodeGen/AArch64/arm64-hello.ll +++ test/CodeGen/AArch64/arm64-hello.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra -disable-fp-elim | FileCheck %s ; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX +; XFAIL: * +; FIXME: ShrinkWrap2: This test fails with shrink-wrapping because we don't +; combine SP updates. ; CHECK-LABEL: main: ; CHECK: sub sp, sp, #32 Index: test/CodeGen/AArch64/arm64-join-reserved.ll =================================================================== --- test/CodeGen/AArch64/arm64-join-reserved.ll +++ test/CodeGen/AArch64/arm64-join-reserved.ll @@ -1,4 +1,7 @@ ; RUN: llc < %s -verify-machineinstrs | FileCheck %s +; XFAIL: * +; FIXME: ShrinkWrap2: This test fails with shrink-wrapping enabled because we +; don't spill x29, so we merge the store of x30 with wrz. target triple = "arm64-apple-macosx10" ; Make sure that a store to [sp] addresses off sp directly. Index: test/CodeGen/AArch64/arm64-large-frame.ll =================================================================== --- test/CodeGen/AArch64/arm64-large-frame.ll +++ test/CodeGen/AArch64/arm64-large-frame.ll @@ -1,4 +1,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim -disable-post-ra < %s | FileCheck %s +; XFAIL: * +; FIXME: ShrinkWrap2: This test fails with shrink-wrapping enabled because we +; don't save LR. declare void @use_addr(i8*) @addr = global i8* null Index: test/CodeGen/X86/ShrinkWrapping/BasicBranch.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/BasicBranch.mir @@ -0,0 +1,37 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog -debug-only=shrink-wrap2 %s -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2, %bb.1 + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.3 + + %rbx = IMPLICIT_DEF + JMP_1 %bb.3 + + bb.2: + RET 0 + + bb.3: + %rbx = IMPLICIT_DEF + RET 0 +... +# Basic shrink-wrapping example. Early return with uses of CSRs in the body. +#CHECK-LABEL: f0 + +#CHECK: BB#1 uses : %RBX +#CHECK-NEXT: BB#3 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#1: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#3: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/CriticalEdge.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/CriticalEdge.mir @@ -0,0 +1,42 @@ +# RUN: llc -march=x86 -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +# This is a reduced test case from test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.3 + + JMP_1 %bb.3 + + bb.1: + RET 0 + + bb.2: + RET 0 + + bb.3: + successors: %bb.4, %bb.2 + + %esi = IMPLICIT_DEF + + %eflags = IMPLICIT_DEF + JGE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + successors:%bb.1, %bb.2 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.1, implicit killed %eflags + JMP_1 %bb.2 +... +#CHECK-LABEL: f0 + +#CHECK: BB#3 uses : %ESI +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#3: Saves: %ESI, | Restores: %ESI, Index: test/CodeGen/X86/ShrinkWrapping/CriticalEdge2.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/CriticalEdge2.mir @@ -0,0 +1,36 @@ +# RUN: llc -march=x86 -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +# This is a reduced test case from test/CodeGen/X86/2007-08-09-IllegalX86-64Asm.ll +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.4, %bb.2 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.4, implicit killed %eflags + JMP_1 %bb.2 + + bb.2: + successors: %bb.3, %bb.4 + + %ebx = IMPLICIT_DEF + %eflags = IMPLICIT_DEF + JNE_1 %bb.4, implicit killed %eflags + JMP_1 %bb.3 + + bb.3: + RET 0 + + bb.4: + RET 0 +... +#CHECK-LABEL: f0 + +#CHECK: BB#1 uses : %EBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#1: Saves: %EBX, | Restores: %EBX, Index: test/CodeGen/X86/ShrinkWrapping/CriticalEdgeLoop.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/CriticalEdgeLoop.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +# This is a reduced test case from test/CodeGen/X86/2009-04-27-CoalescerAssert.ll +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.3, %bb.1 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.4 + + JMP_1 %bb.4 + + bb.2: + + bb.3: + successors: %bb.4, %bb.2 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + successors: %bb.6, %bb.5 + + %rbx = IMPLICIT_DEF + + %eflags = IMPLICIT_DEF + JE_1 %bb.6, implicit killed %eflags + JMP_1 %bb.5 + + bb.5: + RET 0 + + bb.6: + RET 0 + +... +#CHECK-LABEL: f0 + +#CHECK: BB#4 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#4: Saves: %RBX, | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/InfiniteLoop.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/InfiniteLoop.mir @@ -0,0 +1,36 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +body: | + bb.0: + successors: %bb.1, %bb.2 + liveins: %edi + + %eflags = IMPLICIT_DEF + JNE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + RET 0 + + bb.2: + successors: %bb.3 + + %rbx = IMPLICIT_DEF + + bb.3: + successors: %bb.3 + + JMP_1 %bb.3 +... +# Check that we don't save on a branch that never returns. +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %RBX +#CHECK-NEXT: Remove uses from no-return BB#2 +#CHECK-NOT: Saves: +#CHECK-NOT: restores: Index: test/CodeGen/X86/ShrinkWrapping/IrreducibleCFG.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/IrreducibleCFG.mir @@ -0,0 +1,81 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.10, %bb.6 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.10, implicit killed %eflags + JMP_1 %bb.6 + + bb.1: + successors: %bb.6 + + JMP_1 %bb.6 + + bb.2: + successors: %bb.10 + + JMP_1 %bb.10 + + bb.3: + successors: %bb.4 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.4 + + bb.4: + successors: %bb.5, %bb.9 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.9, implicit killed %eflags + JMP_1 %bb.5 + + bb.5: + RET 0 + + bb.6: + successors: %bb.2, %bb.7 + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.7 + + bb.7: + successors: %bb.4 + + JMP_1 %bb.4 + + bb.8: + successors: %bb.3, %bb.1 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.1 + + bb.9: + successors: %bb.4, %bb.8 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.4, implicit killed %eflags + JMP_1 %bb.8 + + bb.10: + successors: %bb.7 + + JMP_1 %bb.7 + +... +# Check that we handle irreducible loops and save / restore outside them. + +#CHECK-LABEL: f0 +#CHECK: BB#2 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#5: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/LoopBasic.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/LoopBasic.mir @@ -0,0 +1,61 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2, %bb.1 + + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.3 + + JMP_1 %bb.3 + + bb.2: + RET 0 + + bb.3: + successors: %bb.4, %bb.5 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.5, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + successors: %bb.6 + + JMP_1 %bb.6 + + bb.5: + successors: %bb.6 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.6 + + bb.6: + successors: %bb.7, %bb.3 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.7 + + bb.7: + RET 0 +... +# Check that we don't save inside loops. + +#CHECK-LABEL: f0 + +#CHECK: BB#5 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#1: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#7: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/LoopInCondition.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/LoopInCondition.mir @@ -0,0 +1,40 @@ +# RUN: llc -march=x86 -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +# This is a reduced test case from test/CodeGen/X86/2007-11-06-InstrSched.ll. +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.3 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.4 + + JMP_1 %bb.4 + + bb.3: + successors: %bb.3, %bb.4 + + %esi = IMPLICIT_DEF + %eflags = IMPLICIT_DEF + JB_1 %bb.3, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + + RET 0 +... +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %ESI +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %ESI, | Restores: +#CHECK-NEXT: BB#3: Saves: | Restores: %ESI, Index: test/CodeGen/X86/ShrinkWrapping/LoopNoPreheader.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/LoopNoPreheader.mir @@ -0,0 +1,57 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +--- | + define void @f0() nounwind { + entry: + ret void + } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2, %bb.1 + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.3 + + JMP_1 %bb.3 + + bb.2: + successors: %bb.3 + + JMP_1 %bb.3 + + bb.3: + successors: %bb.4 + + %rbx = IMPLICIT_DEF + JMP_1 %bb.4 + + bb.4: + successors: %bb.3, %bb.5 + + %eflags = IMPLICIT_DEF + JE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.5 + RET 0 + + bb.5: + + RET 0 + +... +# Check that we handle loops with no preheader. This should propagate through +# the loop's predecessors. + +#CHECK-LABEL: f0 + +#CHECK: BB#4 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#5: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/LoopNoPreheaderLatchExit.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/LoopNoPreheaderLatchExit.mir @@ -0,0 +1,53 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +# XFAIL: * +--- | + define void @f0() nounwind { + entry: + ret void + } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.2, %bb.1 + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.3 + + JMP_1 %bb.3 + + bb.2: + successors: %bb.3 + + JMP_1 %bb.3 + + bb.3: + successors: %bb.4 + + %rbx = IMPLICIT_DEF + JMP_1 %bb.4 + + bb.4: + successors: %bb.3 + + %eflags = IMPLICIT_DEF + JE_1 %bb.3, implicit killed %eflags + RET 0 + +... +# FIXME: ShrinkWrap2: This test still fails, since there is no way to place a +# restore outside a loop. This should not be possible in real code. + +#CHECK-LABEL: f0 + +#CHECK: BB#3 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#3: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/MultipleCriticalEdges.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/MultipleCriticalEdges.mir @@ -0,0 +1,50 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +--- | + define void @f0() nounwind { ret void } + +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.3 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.4, %bb.2 + + %ebx = IMPLICIT_DEF + %eflags = IMPLICIT_DEF + JNE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.4 + + bb.2: + successors: %bb.4, %bb.3 + + %ebx = IMPLICIT_DEF + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + RET 0 + + bb.3: + RET 0 + +... +# Check that we handle multiple critical edges. + +#CHECK-LABEL: f0 + +#CHECK: BB#1 uses : %RBX +#CHECK-NEXT: BB#2 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#3: Saves: | Restores: %RBX, +#CHECK-NEXT: BB#4: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/NestedLoopsCriticalEdges.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/NestedLoopsCriticalEdges.mir @@ -0,0 +1,64 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=stack-protector -run-pass=prologepilog %s -enable-shrink-wrap2=true -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +# XFAIL: * +--- | + define void @f0() nounwind { + entry: + ret void + } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.6 + + %eflags = IMPLICIT_DEF + JE_1 %bb.1, implicit killed %eflags + JMP_1 %bb.6 + + bb.1: + successors: %bb.2, %bb.6 + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.6 + + bb.2: + successors: %bb.3 + + %rbx = IMPLICIT_DEF + JMP_1 %bb.3 + + bb.3: + successors: %bb.4 + JMP_1 %bb.4 + + bb.4: + successors: %bb.4, %bb.5 + + %eflags = IMPLICIT_DEF + JE_1 %bb.4, implicit killed %eflags + JMP_1 %bb.5 + + bb.5: + successors: %bb.6, %bb.3 + + %eflags = IMPLICIT_DEF + JE_1 %bb.6, implicit killed %eflags + JMP_1 %bb.3 + + bb.6: + RET 0 + +... +# Mix nested loops and critical edges. +# FIXME: ShrinkWrap2: This fails because we propagate attributes to the +# critical edges. + +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#2: Saves: %RBX, | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/NoReturnPath.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/NoReturnPath.mir @@ -0,0 +1,60 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=stack-protector -run-pass=prologepilog %s -enable-shrink-wrap2=true -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +# This is a reduced test case from test/CodeGen/X86/2009-09-10-SpillComments.ll +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.6, %bb.1 + + %rbx = IMPLICIT_DEF + %eflags = IMPLICIT_DEF + JNE_1 %bb.6, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + liveins: %rbx + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.2 + + bb.2: + RET 0 + + bb.3: + successors: %bb.4 + liveins: %rbx + + bb.4: + successors: %bb.5, %bb.4 + liveins: %rbx + + %eflags = IMPLICIT_DEF + JNE_1 %bb.4, implicit killed %eflags + JMP_1 %bb.5 + + bb.5: + successors: %bb.4 + liveins: %rbx + + %rbx = IMPLICIT_DEF + JMP_1 %bb.4 + + bb.6: + RET 0 +... +#CHECK-LABEL: f0 + +#CHECK: BB#0 uses : %RBX +#CHECK-NEXT: BB#5 uses : %RBX +#CHECK-NEXT: Remove uses from no-return BB#3 +#CHECK-NEXT: Remove uses from no-return BB#4 +#CHECK-NEXT: Remove uses from no-return BB#5 +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %RBX, | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/Paper1Figure2CriticalEdge.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/Paper1Figure2CriticalEdge.mir @@ -0,0 +1,47 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.3, %bb.4 + + %eflags = IMPLICIT_DEF + JE_1 %bb.4, implicit killed %eflags + JMP_1 %bb.3 + + bb.2: + successors: %bb.4 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.4 + + bb.3: + RET 0 + + bb.4: + + %ebx = IMPLICIT_DEF + RET 0 +... +# Fig. 2 in Chow's paper. + +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %RBX +#CHECK-NEXT: BB#4 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#3: Saves: | Restores: %RBX, +#CHECK-NEXT: BB#4: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/Paper2Figure1.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/Paper2Figure1.mir @@ -0,0 +1,56 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.3 + JMP_1 %bb.3 + + bb.2: + successors: %bb.3 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.3 + + bb.3: + successors: %bb.5, %bb.4 + + %eflags = IMPLICIT_DEF + JE_1 %bb.5, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + successors: %bb.6 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.6 + + bb.5: + successors: %bb.6 + + JMP_1 %bb.6 + + bb.6: + RET 0 +... +# Fig 1 in Lupo and Wilken's paper. + +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %RBX +#CHECK-NEXT: BB#4 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#2: Saves: %RBX, | Restores: %RBX, +#CHECK-NEXT: BB#4: Saves: %RBX, | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/Paper2Figure2.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/Paper2Figure2.mir @@ -0,0 +1,120 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.8 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.8, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.2, %bb.7 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.7, implicit killed %eflags + JMP_1 %bb.2 + + bb.2: + successors: %bb.3, %bb.5 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.5, implicit killed %eflags + JMP_1 %bb.3 + + bb.3: + successors: %bb.4, %bb.5 + + %ebx = IMPLICIT_DEF + %eflags = IMPLICIT_DEF + JNE_1 %bb.5, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + successors: %bb.5 + + %ebx = MOV32ri 9 + JMP_1 %bb.5 + + bb.5: + successors: %bb.6, %bb.7 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.7, implicit killed %eflags + JMP_1 %bb.6 + + bb.6: + successors: %bb.7 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.7 + + bb.7: + successors: %bb.15 + + JMP_1 %bb.15 + + bb.8: + successors: %bb.9, %bb.10 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.10, implicit killed %eflags + JMP_1 %bb.9 + + bb.9: + successors: %bb.11 + + JMP_1 %bb.11 + + bb.10: + successors: %bb.11 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.11 + + bb.11: + successors: %bb.12, %bb.13 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.13, implicit killed %eflags + JMP_1 %bb.12 + + bb.12: + successors: %bb.14 + + JMP_1 %bb.14 + + bb.13: + successors: %bb.14 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.14 + + bb.14: + successors: %bb.15 + JMP_1 %bb.15 + + + bb.15: + RET 0 +... +# Fig 2 in Lupo and Wilken's paper. + +#CHECK-LABEL: f0 + +#CHECK: BB#3 uses : %RBX +#CHECK-NEXT: BB#4 uses : %RBX +#CHECK-NEXT: BB#6 uses : %RBX +#CHECK-NEXT: BB#10 uses : %RBX +#CHECK-NEXT: BB#13 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#1: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#7: Saves: | Restores: %RBX, +#CHECK-NEXT: BB#10: Saves: %RBX, | Restores: %RBX, +#CHECK-NEXT: BB#13: Saves: %RBX, | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/PropagateLoopUses.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/PropagateLoopUses.mir @@ -0,0 +1,113 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.2 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.15 + + JMP_1 %bb.15 + + bb.2: + successors: %bb.11 + + %r15 = IMPLICIT_DEF + %r14 = IMPLICIT_DEF + %rbx = IMPLICIT_DEF + JMP_1 %bb.11 + + bb.3: + successors: %bb.4, %bb.3 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + successors: %bb.6 + liveins: %r14 + + %r14 = IMPLICIT_DEF + JMP_1 %bb.6 + + bb.5: + successors: %bb.6 + + JMP_1 %bb.6 + + bb.6: + successors: %bb.7 + + JMP_1 %bb.7 + + bb.7: + successors: %bb.8, %bb.9 + + %eflags = IMPLICIT_DEF + JA_1 %bb.8, implicit killed %eflags + JMP_1 %bb.9 + + bb.8: + successors: %bb.5, %bb.7 + + %eflags = IMPLICIT_DEF + JE_1 %bb.5, implicit killed %eflags + JMP_1 %bb.7 + + bb.9: + successors: %bb.10, %bb.7 + liveins: %rbx + + %eflags = IMPLICIT_DEF + JE_1 %bb.7, implicit killed %eflags + JMP_1 %bb.10 + + bb.10: + successors: %bb.11 + + + bb.11: + successors: %bb.12, %bb.3 + + %eflags = IMPLICIT_DEF + JE_1 %bb.12, implicit killed %eflags + JMP_1 %bb.3 + + bb.12: + successors: %bb.13, %bb.14 + + %eflags = IMPLICIT_DEF + JE_1 %bb.14, implicit killed %eflags + + bb.13: + successors: %bb.15 + + JMP_1 %bb.15 + + bb.14: + RET 0 + + bb.15: + RET 0 +... +# Check that we propagate the loop uses to its predecessors and successors. + +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %RBX, %R14, %R15 +#CHECK-NEXT: BB#10 uses : %R14 +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#2: Saves: %RBX, %R14, %R15, | Restores: %RBX, %R15 +#CHECK-NEXT: BB#13: Saves: | Restores: %R14 +#CHECK-NEXT: BB#14: Saves: | Restores: %R14 Index: test/CodeGen/X86/ShrinkWrapping/SCCCriticalEdge.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/SCCCriticalEdge.mir @@ -0,0 +1,48 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +# XFAIL: * +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.5 + + %eflags = IMPLICIT_DEF + JE_1 %bb.1, implicit killed %eflags + JMP_1 %bb.5 + + bb.1: + successors: %bb.2 + + %rbx = IMPLICIT_DEF + JMP_1 %bb.2 + + bb.2: + successors: %bb.3 + + JMP_1 %bb.3 + + bb.3: + successors: %bb.3, %bb.5 + + %eflags = IMPLICIT_DEF + JE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.5 + + bb.5: + RET 0 + +... +# FIXME: ShrinkWrap2: This still fails because we propagate attributes where we +# could not do it. + +#CHECK-LABEL: f0 + +#CHECK: BB#1 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#1: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#2: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/SaveBeforeLoop.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/SaveBeforeLoop.mir @@ -0,0 +1,58 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRE: asserts +# XFAIL: x86 +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.4 + + %eflags = IMPLICIT_DEF + JE_1 %bb.1, implicit killed %eflags + JMP_1 %bb.4 + + bb.1: + successors: %bb.2, %bb.5 + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.5 + + bb.2: + successors: %bb.3 + + %rbx = IMPLICIT_DEF + JMP_1 %bb.3 + + bb.3: + successors: %bb.3, %bb.5 + + %eflags = IMPLICIT_DEF + JE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.5 + + bb.5: + successors: %bb.5, %bb.6 + + %eflags = IMPLICIT_DEF + JE_1 %bb.5, implicit killed %eflags + JMP_1 %bb.6 + + bb.4: + RET 0 + + bb.6: + RET 0 +... +# FIXME: ShrinkWrap2: This fails because we propagate attributes where we could +# avoid doing it. + +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#2: Saves: %RBX, | Restores: %RBX Index: test/CodeGen/X86/ShrinkWrapping/SimpleLoopBranch.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/SimpleLoopBranch.mir @@ -0,0 +1,40 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.3, %bb.2 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.2 + + bb.1: + successors: %bb.3, %bb.2 + + %rbx = IMPLICIT_DEF + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.2 + + bb.2: + successors: %bb.1 + + JMP_1 %bb.1 + + bb.3: + RET 0 +... +# Check that we don't save inside loops. + +#CHECK-LABEL: f0 + +#CHECK: BB#1 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#0: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#3: Saves: | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/StackAlignment.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/StackAlignment.mir @@ -0,0 +1,37 @@ +# RUN: llc -disable-fp-elim -mtriple=x86_64-- -run-pass=prologepilog %s -o - | FileCheck %s +# REQUIRE: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +stack: + - { id: 0, offset: 0, size: 8, alignment: 8 } +body: | + bb.0: + successors: %bb.2, %bb.1 + + %eflags = IMPLICIT_DEF + JE_1 %bb.2, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.3 + + %rbx = IMPLICIT_DEF + %r14 = IMPLICIT_DEF + JMP_1 %bb.3 + + bb.2: + RET 0 + + bb.3: + liveins: %rbx + + %rax = MOV64rm %stack.0, %rbx, _, 0, _ + RET 0, %rax +... +# Check that we do the stack adjustments instead of pushes. +#CHECK-LABEL: f0 +#CHECK: %rsp = frame-setup SUB64ri8 %rsp, 16 Index: test/CodeGen/X86/ShrinkWrapping/Tree.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/Tree.mir @@ -0,0 +1,57 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=prologepilog %s -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.1, %bb.4 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.4, implicit killed %eflags + JMP_1 %bb.1 + + bb.1: + successors: %bb.2, %bb.3 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.2 + + bb.2: + %ebx = IMPLICIT_DEF + RET 0 + + bb.3: + %ebx = IMPLICIT_DEF + RET 0 + + bb.4: + successors: %bb.5, %bb.6 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.6, implicit killed %eflags + JMP_1 %bb.5 + + bb.5: + %ebx = IMPLICIT_DEF + RET 0 + + bb.6: + RET 0 +... +# Check that we save only on branches we need in a tree-like CFG. + +#CHECK-LABEL: f0 + +#CHECK: BB#2 uses : %RBX +#CHECK-NEXT: BB#3 uses : %RBX +#CHECK-NEXT: BB#5 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#1: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#2: Saves: | Restores: %RBX, +#CHECK-NEXT: BB#3: Saves: | Restores: %RBX, +#CHECK-NEXT: BB#5: Saves: %RBX, | Restores: %RBX, Index: test/CodeGen/X86/ShrinkWrapping/lit.local.cfg =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'X86' in config.root.targets: + config.unsupported = True Index: test/CodeGen/X86/ShrinkWrapping/optimize-max-0.mir =================================================================== --- /dev/null +++ test/CodeGen/X86/ShrinkWrapping/optimize-max-0.mir @@ -0,0 +1,47 @@ +# RUN: llc -mtriple=x86_64-- -run-pass=stack-protector -run-pass=prologepilog %s -enable-shrink-wrap2=true -debug-only=shrink-wrap2 -o /dev/null 2>&1 | FileCheck %s +# REQUIRES: asserts +# XFAIL: x86 +--- | + define void @f0() nounwind { ret void } +... +--- +name: f0 +tracksRegLiveness: true +body: | + bb.0: + successors: %bb.6, %bb.3 + + %eflags = IMPLICIT_DEF + JE_1 %bb.6, implicit killed %eflags + JMP_1 %bb.3 + + bb.3: + successors: %bb.3, %bb.4 + + %eflags = IMPLICIT_DEF + JNE_1 %bb.3, implicit killed %eflags + JMP_1 %bb.4 + + bb.4: + successors: %bb.6 + + JMP_1 %bb.6 + + bb.6: + successors: %bb.8 + + %ebx = IMPLICIT_DEF + JMP_1 %bb.8 + + bb.8: + RET 0 + +... +# FIXME: ShrinkWrap2: This fails because we detect a critical edge. + +#CHECK-LABEL: f0 + +#CHECK: BB#3 uses : %RBX +#CHECK: **** Shrink-wrapping results +#CHECK-NEXT: BB#3: Saves: %RBX, | Restores: +#CHECK-NEXT: BB#4: Saves: | Restores: %RBX,