Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -93,6 +93,7 @@ void initializeCallGraphViewerPass(PassRegistry&); void initializeCallGraphWrapperPassPass(PassRegistry&); void initializeCodeGenPreparePass(PassRegistry&); +void initializeCodeSizeOutlinerLegacyPassPass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); void initializeConstantPropagationPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -186,6 +186,7 @@ (void) llvm::createPrintBasicBlockPass(os); (void) llvm::createModuleDebugInfoPrinterPass(); (void) llvm::createPartialInliningPass(); + (void) llvm::createCodeSizeOutlinerPass(); (void) llvm::createLintPass(); (void) llvm::createSinkingPass(); (void) llvm::createLowerAtomicPass(); Index: include/llvm/Transforms/IPO.h =================================================================== --- include/llvm/Transforms/IPO.h +++ include/llvm/Transforms/IPO.h @@ -207,6 +207,12 @@ ModulePass *createPartialInliningPass(); //===----------------------------------------------------------------------===// +/// createCodeSizeOutlinerPass - This pass outlines congruent parts of +/// functions. +/// +ModulePass *createCodeSizeOutlinerPass(); + +//===----------------------------------------------------------------------===// // createMetaRenamerPass - Rename everything with metasyntatic names. // ModulePass *createMetaRenamerPass(); Index: include/llvm/Transforms/IPO/CodeSizeOutliner.h =================================================================== --- /dev/null +++ include/llvm/Transforms/IPO/CodeSizeOutliner.h @@ -0,0 +1,30 @@ +//===-- CodeSizeOutliner.h - DCE unreachable internal functions ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This transform outlines congruent chains of instructions from the current +// module. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_IPO_CODESIZEOUTLINER_H +#define LLVM_TRANSFORMS_IPO_CODESIZEOUTLINER_H + +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { +/// Pass to remove unused function declarations. +class CodeSizeOutlinerPass : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_IPO_CODESIZEOUTLINER_H Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -63,6 +63,7 @@ #include "llvm/Transforms/GCOVProfiler.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/IPO/ArgumentPromotion.h" +#include "llvm/Transforms/IPO/CodeSizeOutliner.h" #include "llvm/Transforms/IPO/ConstantMerge.h" #include "llvm/Transforms/IPO/CrossDSOCFI.h" #include "llvm/Transforms/IPO/DeadArgumentElimination.h" Index: lib/Passes/PassRegistry.def =================================================================== --- lib/Passes/PassRegistry.def +++ lib/Passes/PassRegistry.def @@ -39,6 +39,7 @@ #define MODULE_PASS(NAME, CREATE_PASS) #endif MODULE_PASS("always-inline", AlwaysInlinerPass()) +MODULE_PASS("codesizeoutliner", CodeSizeOutlinerPass()) MODULE_PASS("constmerge", ConstantMergePass()) MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass()) MODULE_PASS("deadargelim", DeadArgumentEliminationPass()) Index: lib/Transforms/IPO/CMakeLists.txt =================================================================== --- lib/Transforms/IPO/CMakeLists.txt +++ lib/Transforms/IPO/CMakeLists.txt @@ -2,6 +2,7 @@ AlwaysInliner.cpp ArgumentPromotion.cpp BarrierNoopPass.cpp + CodeSizeOutliner.cpp ConstantMerge.cpp CrossDSOCFI.cpp DeadArgumentElimination.cpp Index: lib/Transforms/IPO/CodeSizeOutliner.cpp =================================================================== --- /dev/null +++ lib/Transforms/IPO/CodeSizeOutliner.cpp @@ -0,0 +1,1727 @@ +//===-- CodeSizeOutliner.cpp - Propagate constants through calls -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements a simple algorithm for outlining congruent chains of +// instructions from the current module. +// +//===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/CodeSizeOutliner.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Bitcode/BitcodeWriterPass.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/Outliner.h" +#include + +using namespace llvm; + +static cl::opt MinOccurrences( + "cso-min-occurrences", cl::init(2), cl::Hidden, + cl::desc( + "Min number of occurrences to consider a candidate for outlining.")); +static cl::opt MinInstructionLength( + "cso-min-instructions", cl::init(1), cl::Hidden, + cl::desc( + "Min number of instructions to consider a candidate for outlining.")); +static cl::opt + MinBenefit("cso-min-benefit", cl::init(1), cl::Hidden, + cl::desc("Min estimated benefit to be considered profitable.")); +static cl::opt + EnableOutliner("enable-cso", cl::init(false), cl::Hidden, + cl::desc("Enable outlining for code size.")); +static cl::opt DumpCC( + "cso-dump-cc", cl::init(false), cl::Hidden, + cl::desc("Dump information about the congruent between instructions.")); + +#define DEBUG_TYPE "codesizeoutliner" +STATISTIC(NumOccurrencesOutlined, "Number of occurrences outlined"); +STATISTIC(NumCandidatesOutlined, "Number of outlined functions created"); + +namespace { +/// \brief The information necessary to create an outlined function from a set +/// of repeated instruction occurrences. +struct AdditionalCandidateData : public OutlineCandidate::AdditionalData { + /// Inputs into this candidate : Vector. + std::vector> InputSeq; + + /// Outputs from this candidate. + SparseBitVector<> Outputs; + + /// The index of the output to fold into a return. + unsigned OutputToFold = -1; +}; + +/// \brief GVN Expression with relaxed equivalency constraints. +class RelaxedExpression { +public: + /// A special state for special equivalence constraints. + enum SpecialState { StructGep, ConstrainedCall, None }; + + RelaxedExpression(Instruction &I) + : Inst(&I), SS(SpecialState::None), HashVal(0) { + // Check the special state. + /// Struct geps require constant indices. + if (GetElementPtrInst *GEP = dyn_cast(&I)) { + if (gepContainsStructType(GEP)) + SS = RelaxedExpression::StructGep; + } else if (CallInst *CI = dyn_cast(&I)) { + /// Don't take the address of inline asm calls. + if (CI->isInlineAsm()) + SS = RelaxedExpression::ConstrainedCall; + /// Intrinsics and functions without exact definitions can not + /// have their address taken. + else if (Function *F = CI->getCalledFunction()) { + if (!F->hasExactDefinition()) + SS = RelaxedExpression::ConstrainedCall; + } + } + } + bool equals(const RelaxedExpression &OE) const { + if (SS != OE.SS) + return false; + // Handle calls separately to allow for mismatched tail calls. + if (CallInst *CI = dyn_cast(Inst)) { + const CallInst *RCI = cast(OE.Inst); + if (CI->getFunctionType() != RCI->getFunctionType() || + CI->getNumOperands() != RCI->getNumOperands()) + return false; + if (CI->getCallingConv() != RCI->getCallingConv() || + CI->getAttributes() != RCI->getAttributes() || + !CI->hasIdenticalOperandBundleSchema(*RCI)) + return false; + } else if (!Inst->isSameOperationAs(OE.Inst, + Instruction::CompareIgnoringAlignment)) + return false; + return checkSpecialEquivalence(OE); + } + // Special checks for instructions that have non generic equivalency. + bool checkSpecialEquivalence(const RelaxedExpression &Other) const { + const Instruction *OE = Other.Inst; + switch (Inst->getOpcode()) { + case Instruction::ShuffleVector: { + // LangRef : The shuffle mask operand is required to be a + // constant vector with either constant integer or undef values. + return Inst->getOperand(2) == OE->getOperand(2); + } + case Instruction::Call: { + const CallInst *CI = cast(Inst); + const CallInst *OECI = cast(OE); + if (SS == ConstrainedCall) + return checkConstrainedCallEquivalence(CI, OECI); + break; + } + case Instruction::GetElementPtr: { + // Struct indices must be constant. + if (SS == StructGep) + return compareStructIndices(cast(Inst), + cast(OE)); + break; + } + default: + break; + } + return true; + } + static bool checkConstrainedCallEquivalence(const CallInst *CI, + const CallInst *OECI) { + const Value *CIVal = CI->getCalledValue(); + if (CIVal != OECI->getCalledValue()) + return false; + if (const Function *CalledII = dyn_cast(CIVal)) { + switch (CalledII->getIntrinsicID()) { + case Intrinsic::memmove: + case Intrinsic::memcpy: + case Intrinsic::memset: + /// Alignment. + return CI->getArgOperand(3) == OECI->getArgOperand(3) && + /// Volatile flag. + CI->getArgOperand(4) == OECI->getArgOperand(4); + case Intrinsic::objectsize: + /// Min. + return CI->getArgOperand(1) == OECI->getArgOperand(1) && + /// Null unknown. + CI->getArgOperand(2) == OECI->getArgOperand(2); + case Intrinsic::expect: + /// Expected value. + return CI->getArgOperand(1) == OECI->getArgOperand(1); + case Intrinsic::prefetch: + /// RW. + return CI->getArgOperand(1) == OECI->getArgOperand(1) && + /// Locality. + CI->getArgOperand(2) == OECI->getArgOperand(2) && + /// Cache Type. + CI->getArgOperand(3) == OECI->getArgOperand(3); + case Intrinsic::ctlz: + case Intrinsic::cttz: + /// Is Zero Undef + return CI->getArgOperand(1) == OECI->getArgOperand(1); + default: + break; + } + } + return true; + } + + static bool compareStructIndices(GetElementPtrInst *L, + const GetElementPtrInst *R) { + SmallVector GepIdxs(L->indices().begin(), L->indices().end()); + GepIdxs.pop_back(); + while (!GepIdxs.empty()) { + Type *PtrTy = L->getGEPReturnType(L->getPointerOperand(), GepIdxs); + if (PtrTy->isPointerTy() && + PtrTy->getPointerElementType()->isStructTy()) { + unsigned OpNo = GepIdxs.size() + 1; + if (L->getOperand(OpNo) != R->getOperand(OpNo)) + return false; + } + GepIdxs.pop_back(); + } + return true; + } + // Check to see if the provided gep /p GEP indexes a struct type. + bool gepContainsStructType(GetElementPtrInst *GEP) { + Type *PtrTy = GEP->getPointerOperandType(); + if (PtrTy->isPointerTy() && PtrTy->getPointerElementType()->isStructTy()) + return true; + + SmallVector GepIdxs(GEP->indices().begin(), + GEP->indices().end()); + GepIdxs.pop_back(); + while (!GepIdxs.empty()) { + PtrTy = GEP->getGEPReturnType(GEP->getPointerOperand(), GepIdxs); + if (PtrTy->isPointerTy() && PtrTy->getPointerElementType()->isStructTy()) + return true; + GepIdxs.pop_back(); + } + return false; + } + + hash_code getComputedHash() const { + if (static_cast(HashVal) == 0) + HashVal = getHashValue(); + return HashVal; + } + hash_code getHashValue() const { + SmallVector HashRange; + HashRange.push_back(SS); + HashRange.push_back(Inst->getNumOperands()); + HashRange.push_back(reinterpret_cast(Inst->getType())); + for (auto &Op : Inst->operands()) + HashRange.emplace_back(reinterpret_cast(Op->getType())); + return hash_combine_range(HashRange.begin(), HashRange.end()); + } + +private: + Instruction *Inst; + SpecialState SS; + mutable hash_code HashVal; +}; +}; // namespace + +namespace llvm { +// DenseMapInfo for the relaxed expression class. +template <> struct DenseMapInfo { + static const RelaxedExpression *getEmptyKey() { + auto Val = static_cast(-1); + Val <<= + PointerLikeTypeTraits::NumLowBitsAvailable; + return reinterpret_cast(Val); + } + static const RelaxedExpression *getTombstoneKey() { + auto Val = static_cast(~1U); + Val <<= + PointerLikeTypeTraits::NumLowBitsAvailable; + return reinterpret_cast(Val); + } + static unsigned getHashValue(const RelaxedExpression *E) { + return E->getComputedHash(); + } + static bool isEqual(const RelaxedExpression *LHS, + const RelaxedExpression *RHS) { + if (LHS == RHS) + return true; + if (LHS == getTombstoneKey() || RHS == getTombstoneKey() || + LHS == getEmptyKey() || RHS == getEmptyKey()) + return false; + // Compare hashes before equality. This is *not* what the hashtable does, + // since it is computing it modulo the number of buckets, whereas we are + // using the full hash keyspace. Since the hashes are precomputed, this + // check is *much* faster than equality. + if (LHS->getComputedHash() != RHS->getComputedHash()) + return false; + return LHS->equals(*RHS); + } +}; +} // namespace llvm + +namespace { +/// \brief Helper struct containing mapping information for a module. +class IRInstructionMapper : public SequencedOutlinerImpl::OutlinerMapper { +public: + virtual ~IRInstructionMapper() { ExpressionAllocator.Reset(); } + + /// + /// Instruction Information Utilities. + /// + + /// \brief Holds information about a particular instruction. + struct InstructionInfo { + /// The inputs/operands going into this instruction. + SmallVector InputIndexes; + + /// The index of the farthest use of this instruction in the same block + /// parent. + unsigned FarthestInSameBlockOutput = 0; + + /// The size cost of this instruction. + unsigned Cost = -1; + }; + + // Get the instruction at index /p Idx. + Instruction *getInstr(unsigned Idx) { return InstrVec[Idx]; } + // Get the index of /p I inside of the internal vector. + unsigned getInstrIdx(Instruction *I) { + auto It = InstructionToIdxMap.find(I); + return LLVM_UNLIKELY(It == InstructionToIdxMap.end()) ? -1 : It->second; + } + // Get the parent function of the instruction at /p Idx. + Function *getInstrFunction(size_t Idx) { + assert(Idx < InstrVec.size() && "Invalid instruction index"); + return InstrVec[Idx]->getFunction(); + } + // Get or compute the cost of the instruction at /p InstrIdx. + unsigned getInstrCost(TargetTransformInfo &TTI, size_t InstrIdx) { + InstructionInfo *Info = InstrInfo[InstrIdx]; + if (Info->Cost == unsigned(-1)) { + Instruction *I = InstrVec[InstrIdx]; + switch (I->getOpcode()) { + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::UDiv: + case Instruction::URem: + Info->Cost = TargetTransformInfo::TCC_Basic * 2; + break; + default: + Info->Cost = TTI.getUserCost(I); + break; + } + // Be conservative about the cost of loads given they may be folded. + // Estimating a lower cost helps to prevent over estimating the + // benefit of this instruction. + if (isa(I)) + --Info->Cost; + // We give a little bonus to tail calls as they are likely to be more + // beneficial + // within our outlined function. + if (CallInst *CI = dyn_cast(I)) + if (CI->isTailCall()) + ++Info->Cost; + } + return Info->Cost; + } + // Get the instruction info attached to index /p InstrIdx + InstructionInfo &getInstrInfo(size_t InstrIdx) { + InstructionInfo *Info = InstrInfo[InstrIdx]; + assert(Info && "Queried instruction has no info created."); + return *Info; + } + // Create instruction info for the instruction at /p InstrIdx + void createInstrInfo(size_t InstrIdx) { + InstructionInfo *&Info = InstrInfo[InstrIdx]; + assert(!Info && "Instruction info already generated."); + Info = new (InfoAllocator.Allocate()) InstructionInfo(); + Instruction *Inst = InstrVec[InstrIdx]; + + /// Inputs. + unsigned NumOperands = Inst->getNumOperands(); + Info->InputIndexes.reserve(NumOperands); + for (unsigned InIt = 0; InIt < NumOperands; ++InIt) { + unsigned IIdx = 0; + Value *Op = Inst->getOperand(InIt); + assert(!isa(Op) && "Basic block inputs can not be handled."); + if (Instruction *I = dyn_cast(Op)) { + unsigned FoundIIdx = getInstrIdx(I); + if (FoundIIdx <= InstrIdx) + IIdx = FoundIIdx; + } + Info->InputIndexes.emplace_back(IIdx); + } + + /// Outputs. + for (User *Usr : Inst->users()) { + Instruction *I = dyn_cast(Usr); + if (!I || I->getParent() != Inst->getParent()) { + Info->FarthestInSameBlockOutput = -1; + break; + } + unsigned IIdx = getInstrIdx(I); + if (IIdx > Info->FarthestInSameBlockOutput) + Info->FarthestInSameBlockOutput = IIdx; + } + } + + /// + /// Module Mapping Utilities. + /// + + // Map the module /p M to prepare for outlining. + void mapModule(Module &M, ProfileSummaryInfo *PSI, + function_ref GetBFI) { + bool HasProfileData = PSI->hasProfileSummary(); + + // Insert illegal ID at the front to act as a sentinel. + InstrVec.push_back(nullptr); + CCVec.push_back(IllegalID--); + + for (Function &F : M) { + if (!F.hasExactDefinition() || F.hasGC() || F.isDefTriviallyDead()) + continue; + if (F.hasFnAttribute(Attribute::OptimizeNone)) + continue; + // Don't outline non -Oz functions without profile data. + bool FavorSize = F.optForMinSize(); + if (!FavorSize && !HasProfileData) + continue; + BlockFrequencyInfo *BFI = HasProfileData ? &GetBFI(F) : nullptr; + for (BasicBlock &BB : F) { + if (BB.isEHPad()) + continue; + + // Skip hot blocks if we have profile data. + if (HasProfileData) + if (FavorSize ? PSI->isHotBB(&BB, BFI) : !PSI->isColdBB(&BB, BFI)) + continue; + + // Try to map each instruction to a congruency id. + for (Instruction &I : BB) { + // Ignore debug info intrinsics. + if (isa(&I)) + continue; + InstrVec.push_back(&I); + if (canMapInstruction(&I)) { + InstructionToIdxMap.try_emplace(&I, CCVec.size()); + CCVec.push_back(mapInstruction(I)); + } else + CCVec.push_back(IllegalID--); + } + } + } + InstrInfo.assign(InstrVec.size(), nullptr); + if (DumpCC) + dumpCC(M); + } + // Get the number of instructions mapped in this module. + virtual unsigned getNumMappedInstructions() const override { + return InstrVec.size(); + } + +private: + // Map the instruction /p I to a congruency class. + unsigned mapInstruction(Instruction &I) { + // We map each valid instruction to a Relaxed expression and use this for + // detecting congruency. + auto *E = new (ExpressionAllocator) RelaxedExpression(I); + // Assign a CC id to this instruction. + auto ItPair = GlobalCC[I.getOpcode()].try_emplace(E, CCID); + if (ItPair.second) + ++CCID; + return ItPair.first->second; + } + // Checks to see if a provided instruction /p I can be mapped. + bool canMapInstruction(Instruction *I) { + if (I->getOpcode() == Instruction::Call) { + CallInst *CI = cast(I); + // Be very conservative about musttail because it has additional + // guarantees that must be met. + if (CI->isMustTailCall()) + return false; + // Be conservative about return twice calls. + if (CI->canReturnTwice()) + return false; + CallSite CS(CI); + switch (CS.getIntrinsicID()) { + case Intrinsic::objectsize: + case Intrinsic::expect: + case Intrinsic::prefetch: + // Lib C functions are fine. + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: + case Intrinsic::sqrt: + case Intrinsic::pow: + case Intrinsic::powi: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::fma: + case Intrinsic::fabs: + case Intrinsic::minnum: + case Intrinsic::maxnum: + case Intrinsic::copysign: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::round: + // Bit manipulation intrinsics. + case Intrinsic::bitreverse: + case Intrinsic::bswap: + case Intrinsic::ctpop: + case Intrinsic::ctlz: + case Intrinsic::cttz: + return true; + // Non intrinsics are fine if they don't have an inalloca arg. + case Intrinsic::not_intrinsic: + return !CS.hasInAllocaArgument(); + default: + return false; + } + return true; + } + return !(isa(I) || isa(I) || isa(I)); + } + // Dump the mapped congruencies found for the module /p M. + void dumpCC(Module &M) { + for (Function &F : M) { + dbgs() << "function : " << F.getName() << "\n"; + for (auto &BB : F) { + dbgs() << "block : " << BB.getName() << "\n"; + for (Instruction &I : BB) { + auto It = InstructionToIdxMap.find(&I); + size_t Idx = It == InstructionToIdxMap.end() ? -1 : CCVec[It->second]; + dbgs() << "-- " << Idx << " : " << I << '\n'; + } + } + } + for (auto &CCIt : GlobalCC) { + for (auto CC : CCIt) { + size_t Idx = CC.second; + dbgs() << "-- Examining CC ID : " << Idx << "\n"; + for (size_t i = 0, e = CCVec.size(); i < e; ++i) + if (CCVec[i] == Idx) + dbgs() << " - " << *InstrVec[i] << "\n"; + dbgs() << "\n"; + } + } + } + + /// Stores location of instructions mapped to the corresponding index in + /// the CCVec. + std::vector InstrVec; + + /// Stores information for parallel instruction in InstrVec. + std::vector InstrInfo; + SpecificBumpPtrAllocator InfoAllocator; + + /// Map + DenseMap InstructionToIdxMap; + + /// Mapping of expression to congruency id. + std::array, + Instruction::OtherOpsEnd> + GlobalCC; + + /// Memory management for the GVN expressions used for congruency. + mutable BumpPtrAllocator ExpressionAllocator; +}; + +/// \brief Cache output allocas created during outlining and reuse them. +class OutputAllocaCache { +public: + // Get an alloca for type /p Ty in function /p F that has an array + // count of atleast /p Count. + AllocaInst *getFreeAlloca(Function *F, Type *Ty, unsigned Count) { + if (AllocaInst *NewAI = findAndUpdate(F, Ty, Count)) + return NewAI; + // Create the new alloca inst. + Instruction *AllocaInsertPnt = &F->getEntryBlock().front(); + Value *AllocSize = + ConstantInt::get(Type::getInt32Ty(Ty->getContext()), Count); + unsigned AddrSpace = Ty->isPointerTy() ? Ty->getPointerAddressSpace() : 0; + AllocaInst *NewAlloca = + new AllocaInst(Ty, AddrSpace, AllocSize, "", AllocaInsertPnt); + DenseMap &TyMap = FunctionToAllocaCache[F]; + TyMap.try_emplace(Ty, NewAlloca); + return NewAlloca; + } + +private: + // Try to find an existing alloca an update the count if needed. + AllocaInst *findAndUpdate(Function *F, Type *Ty, unsigned Count) { + AllocaInst *AI = nullptr; + DenseMap &TyMap = FunctionToAllocaCache[F]; + auto It = TyMap.find(Ty); + if (It != TyMap.end()) { + AI = It->second; + ConstantInt *CI = cast(AI->getArraySize()); + if (Count > CI->getZExtValue()) + AI->setOperand(0, ConstantInt::get(CI->getType(), Count)); + } + return AI; + } + + /// Map> + DenseMap> FunctionToAllocaCache; +}; + +/// \brief A specific instance of an outlined candidate. +struct FunctionSplicer { + FunctionSplicer(bool EmitProfileData) : EmitProfileData(EmitProfileData) { + // Prepare the function attributes that we don't want to inherit from any + // parents. + NonInheritAttrs.addAttribute(Attribute::AlwaysInline); + NonInheritAttrs.addAttribute(Attribute::ArgMemOnly); + NonInheritAttrs.addAttribute(Attribute::InaccessibleMemOnly); + NonInheritAttrs.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); + NonInheritAttrs.addAttribute(Attribute::InlineHint); + NonInheritAttrs.addAttribute(Attribute::Naked); + NonInheritAttrs.addAttribute(Attribute::NoInline); + NonInheritAttrs.addAttribute(Attribute::NoRecurse); + NonInheritAttrs.addAttribute(Attribute::ReturnsTwice); + NonInheritAttrs.addAttribute(Attribute::ReadOnly); + NonInheritAttrs.addAttribute(Attribute::ReadNone); + NonInheritAttrs.addAttribute(Attribute::NoReturn); + NonInheritAttrs.addAttribute(Attribute::WriteOnly); + } + // Reset the outliner to prepare for a new instance. + void prepareForNewInstance(OutlineCandidate &OC, IRInstructionMapper &OM) { + OutlinedFn = nullptr; + OutputTypeCount.clear(); + OutputGepIdx.clear(); + InitialStartIdx = OC.getOccurrence(0); + AdditionalCandidateData &Data = OC.getData(); + if (Data.Outputs.count() < 2) + return; + // Collect information about the outputs of this new candidate. + // Output parameters are condensed by type, so we need to build the + // offset from the base pointer for each output. + for (size_t OutputIdx : Data.Outputs) { + if (OutputIdx == Data.OutputToFold) + continue; + Type *OutTy = OM.getInstr(InitialStartIdx + OutputIdx)->getType(); + unsigned GepIdx, TyArgNo; + auto TyCountIt = OutputTypeCount.find(OutTy); + if (TyCountIt == OutputTypeCount.end()) { + GepIdx = 0; + TyArgNo = OutputTypeCount.size(); + OutputTypeCount.try_emplace(OutTy, 1, TyArgNo); + } else { + GepIdx = TyCountIt->second.first++; + TyArgNo = TyCountIt->second.second; + } + OutputGepIdx.try_emplace(OutputIdx, TyArgNo, GepIdx); + } + } + + // Outline a new occurrence of an outline chain. + void outlineOccurrence(OutlineCandidate &OC, size_t StartIdx, + IRInstructionMapper &OM) { + ++NumOccurrencesOutlined; + Instruction *Tail = OM.getInstr(StartIdx + (OC.Len - 1)); + Function *ParentFn = Tail->getFunction(); + bool InitialOccur = !OutlinedFn; + AdditionalCandidateData &Data = OC.getData(); + + /// Split the outline chain into its own block. + Instruction *Head = OM.getInstr(StartIdx); + BasicBlock *EntryBlock = Head->getParent(); + // Split our chain instance into a separate block for extraction. + /// Split up to the head if we aren't at the front of our block. + if (Head != &EntryBlock->front() || + EntryBlock == &ParentFn->getEntryBlock()) + EntryBlock = EntryBlock->splitBasicBlock(Head->getIterator()); + /// Split after the tail. + auto SentinalIt = Tail->getNextNode()->getIterator(); + BasicBlock *Exit = EntryBlock->splitBasicBlock(SentinalIt); + + // Create a new block to patch the outlined section. + BasicBlock *OutlineBlock = + BasicBlock::Create(ParentFn->getContext(), "cso.patch", ParentFn); + + // Create parameter vec for the new call. + unsigned NumOutputs = Data.Outputs.count(); + std::vector Args; + Args.reserve(Data.InputSeq.size() + NumOutputs); + + // Build inputs/outputs in order. + for (size_t i = 0, e = Data.InputSeq.size(); i < e; ++i) { + size_t InstrIdx, OpNo; + std::tie(InstrIdx, OpNo) = Data.InputSeq[i]; + Args.push_back(OM.getInstr(StartIdx + InstrIdx)->getOperand(OpNo)); + } + + // Replace uses of outputs and create reloads. + if (NumOutputs > 0) { + SmallVector OutputArgs(OutputTypeCount.size()); + for (auto &ArgPair : OutputTypeCount) { + unsigned ArgNo, Count; + std::tie(Count, ArgNo) = ArgPair.second; + OutputArgs[ArgNo] = OAC.getFreeAlloca(ParentFn, ArgPair.first, Count); + } + Args.insert(Args.end(), OutputArgs.begin(), OutputArgs.end()); + for (size_t OutputIdx : Data.Outputs) { + if (OutputIdx == Data.OutputToFold) + continue; + Instruction *Out = OM.getInstr(StartIdx + OutputIdx); + unsigned ArgNo, GepIdx; + std::tie(ArgNo, GepIdx) = OutputGepIdx[OutputIdx]; + Value *OutputArg = OutputArgs[ArgNo]; + if (GepIdx != 0) { + Value *Idx = + ConstantInt::get(Type::getInt32Ty(Out->getContext()), GepIdx); + OutputArg = GetElementPtrInst::CreateInBounds( + Out->getType(), OutputArg, Idx, "", OutlineBlock); + } + Instruction *Reload = new LoadInst(OutputArg, "", OutlineBlock); + Out->replaceUsesOutsideBlock(Reload, EntryBlock); + } + if (!InitialOccur) { + Instruction *Out = OM.getInstr(StartIdx + Data.OutputToFold); + Out->removeFromParent(); + Out->dropAllReferences(); + } + } + + // Replace branches to entry block. + EntryBlock->replaceAllUsesWith(OutlineBlock); + BranchInst::Create(Exit, OutlineBlock); + + // Get the first valid debug info. + DebugLoc CallLoc; + if (ParentFn->getSubprogram()) + for (auto It = Head->getIterator(), E = EntryBlock->end(); + !CallLoc && It != E; ++It) + CallLoc = It->getDebugLoc(); + + // If this is the first occurrence then we outline the chain, otherwise we + // erase the entry block because it's dead. + if (OutlinedFn) { + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_nonnull, + LLVMContext::MD_invariant_group, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_fpmath, + LLVMContext::MD_dereferenceable_or_null}; + + // Merge special state. + for (unsigned InitI = InitialStartIdx, CurI = StartIdx, + InitE = InitialStartIdx + OC.Len; + InitI < InitE; ++InitI, ++CurI) { + Instruction *InitII = OM.getInstr(InitI); + Instruction *CurII = OM.getInstr(CurI); + // Make sure the alignment is valid as we skip it during congruency + // finding. + if (LoadInst *LI = dyn_cast(InitII)) + LI->setAlignment(std::min(LI->getAlignment(), + cast(CurII)->getAlignment())); + else if (StoreInst *SI = dyn_cast(InitII)) + SI->setAlignment(std::min(SI->getAlignment(), + cast(CurII)->getAlignment())); + // Make sure that no tails are propagated properly. + else if (CallInst *CI = dyn_cast(InitII)) { + auto TCK = cast(CurII)->getTailCallKind(); + if (TCK == CallInst::TCK_NoTail) + CI->setTailCallKind(CallInst::TCK_NoTail); + else if (TCK != CallInst::TCK_None && !CI->isNoTailCall()) + CI->setTailCallKind(TCK); + } + // Be conservative about flags like nsw/nuw. + else + InitII->andIRFlags(OM.getInstr(CurI)); + + // Merge metadata. + DebugLoc DL = InitII->getDebugLoc(); + if (DL && DL != CurII->getDebugLoc()) + InitII->setDebugLoc(DebugLoc()); + combineMetadata(InitII, CurII, KnownIDs); + } + EntryBlock->eraseFromParent(); + AttributeFuncs::mergeAttributesForInlining(*OutlinedFn, *ParentFn); + } else + outlineInitialOccurrence(OC, OM, StartIdx, Args, EntryBlock); + + // Create the patchup for the outline section. + CallInst *CI = + CallInst::Create(OutlinedFn, Args, "", &OutlineBlock->front()); + if (Data.OutputToFold != unsigned(-1)) { + Instruction *FoldedOut = OM.getInstr(StartIdx + Data.OutputToFold); + if (InitialOccur) + FoldedOut->replaceUsesOutsideBlock(CI, FoldedOut->getParent()); + else { + FoldedOut->replaceAllUsesWith(CI); + FoldedOut->deleteValue(); + } + } + CI->setDebugLoc(CallLoc); + } + + // \brief Finalize this function as an outline instance. In this step we + // re-unique the inputs to the function. + void finalize(OutlineCandidate &OC) { + FunctionType *CurFnTy = OutlinedFn->getFunctionType(); + AdditionalCandidateData &Data = OC.getData(); + unsigned NumInputs = Data.InputSeq.size(); + + // Map to + DenseMap ArgNoToCG, RhsArgNoToCG; + std::vector ArgCongruencyGroups, RhsArgCGroups; + + // Helper function to collect the information on the use of inputs in the + // outlined function. This identifies which arguments are actually congruent + // to each other for a specific function call. + auto CollectInputInfo = [&](CallInst *CI, + DenseMap &ANoToCG, + std::vector &ArgCGroups) { + ArgCGroups.reserve(NumInputs); + for (unsigned i = 0, e = NumInputs; i < e; ++i) { + // We already evaluated the equivalencies for this arg. + if (ANoToCG.count(i)) + continue; + Value *Op = CI->getOperand(i); + BitVector CurrentGroup(NumInputs); + CurrentGroup.set(i); + ANoToCG.try_emplace(i, ArgCGroups.size()); + for (unsigned j = i + 1; j < e; ++j) { + if (ANoToCG.count(j)) + continue; + if (CI->getArgOperand(j) == Op) { + CurrentGroup.set(j); + ANoToCG.try_emplace(j, ArgCGroups.size()); + } + } + ArgCGroups.emplace_back(std::move(CurrentGroup)); + } + }; + + // Build initial equivalences from the first call. + auto UserIt = OutlinedFn->user_begin(); + CallInst *FirstCI = cast(*UserIt); + ++UserIt; + CollectInputInfo(FirstCI, ArgNoToCG, ArgCongruencyGroups); + + // If we have the same amount of congruency groups as we do arguments, + // the they are already unique. + if (NumInputs == ArgCongruencyGroups.size()) + return; + // Check every other user to see if the equivalencies hold up. + BitVector ResolvedInputs(NumInputs); + // BitVector helper to hold non congruent matches between argument groups. + BitVector NonCongruentLeft; + for (auto UserE = OutlinedFn->user_end(); UserIt != UserE; ++UserIt) { + CallInst *CI = cast(*UserIt); + RhsArgCGroups.clear(); + RhsArgNoToCG.clear(); + CollectInputInfo(CI, RhsArgNoToCG, RhsArgCGroups); + ResolvedInputs.reset(); + for (unsigned i = 0, e = NumInputs; i < e; ++i) { + if (ResolvedInputs.test(i)) + continue; + BitVector &LhsGroup = ArgCongruencyGroups[ArgNoToCG[i]]; + BitVector &RhsGroup = RhsArgCGroups[RhsArgNoToCG[i]]; + + /// Build non congruent arguments between the groups. + NonCongruentLeft = LhsGroup; + /// Congruent matches. + LhsGroup &= RhsGroup; + assert(LhsGroup.count() > 0); + + // Non congruent sets on both sides still act as a congruency group. + NonCongruentLeft ^= LhsGroup; + + // Mark arguments as handled. + for (unsigned SetBit : LhsGroup.set_bits()) + ResolvedInputs.set(SetBit); + if (NonCongruentLeft.count() == 0) + continue; + + // Move the non congruent matches from the left group to a + // new congruency group. + unsigned NewGroupId = ArgCongruencyGroups.size(); + ArgCongruencyGroups.emplace_back(std::move(NonCongruentLeft)); + + // Move non congruent matches to a new congruency group + // and remove them from the top level mapping. + for (unsigned SetBit : ArgCongruencyGroups.back().set_bits()) + ArgNoToCG[SetBit] = NewGroupId; + } + } + + // No inputs can be condensed. + if (NumInputs == ArgCongruencyGroups.size()) + return; + + // Build new function from the condensed inputs. + std::vector NewFnTys; + for (auto &It : ArgCongruencyGroups) + NewFnTys.push_back(CurFnTy->getParamType(It.find_first())); + for (unsigned i = NumInputs, e = CurFnTy->getNumParams(); i < e; ++i) + NewFnTys.push_back(CurFnTy->getParamType(i)); + + // Create the merged function. + FunctionType *NewFnTy = + FunctionType::get(CurFnTy->getReturnType(), NewFnTys, false); + Function *MergedFn = Function::Create(NewFnTy, OutlinedFn->getLinkage(), "", + OutlinedFn->getParent()); + MergedFn->takeName(OutlinedFn); + MergedFn->copyAttributesFrom(OutlinedFn); + DenseMap ValMap; + + // Remap the arguments. + auto ArgI = OutlinedFn->arg_begin(); + for (size_t i = 0, e = NumInputs; i < e; ++i, ++ArgI) + ValMap[&*ArgI] = std::next(MergedFn->arg_begin(), ArgNoToCG[i]); + auto MergeArgI = + std::next(MergedFn->arg_begin(), ArgCongruencyGroups.size()); + for (size_t i = NumInputs, e = CurFnTy->getNumParams(); i < e; + ++i, ++ArgI, ++MergeArgI) { + ValMap[&*ArgI] = &*MergeArgI; + } + /// Move Fn Body. + MergedFn->getBasicBlockList().splice(MergedFn->begin(), + OutlinedFn->getBasicBlockList()); + /// Remap arguments. + for (Argument &A : OutlinedFn->args()) { + auto NewArgIt = ValMap.find(&A); + if (NewArgIt != ValMap.end()) + A.replaceAllUsesWith(NewArgIt->second); + } + + // Rewrite the calls to this function with calls to the merged function. + std::vector CallArgs; + for (auto U = OutlinedFn->use_begin(), E = OutlinedFn->use_end(); U != E;) { + CallInst *CI = dyn_cast(U->getUser()); + ++U; + CallArgs.clear(); + /// Map call args by their congruency group. + for (auto &It : ArgCongruencyGroups) + CallArgs.push_back(CI->getArgOperand(It.find_first())); + /// Outputs are retained, except for the promoted value. + for (unsigned i = NumInputs, e = CurFnTy->getNumParams(); i < e; ++i) + CallArgs.push_back(CI->getArgOperand(i)); + CallInst *NewCall = CallInst::Create(MergedFn, CallArgs, "", CI); + NewCall->setDebugLoc(CI->getDebugLoc()); + CI->replaceAllUsesWith(NewCall); + CI->eraseFromParent(); + } + OutlinedFn->eraseFromParent(); + } + +private: + // \brief Outline the initial occurrence of this chain. + void outlineInitialOccurrence(OutlineCandidate &OC, IRInstructionMapper &OM, + size_t StartIdx, ArrayRef Args, + BasicBlock *Entry) { + /// Function type for outlined function. + std::vector Tys; + Tys.reserve(Args.size()); + for (Value *Arg : Args) + Tys.push_back(Arg->getType()); + LLVMContext &Ctx = Entry->getContext(); + AdditionalCandidateData &Data = OC.getData(); + Instruction *FoldedOut = nullptr; + Type *OutputTy = Type::getVoidTy(Ctx); + if (Data.OutputToFold != unsigned(-1)) { + FoldedOut = OM.getInstr(StartIdx + Data.OutputToFold); + OutputTy = FoldedOut->getType(); + } + FunctionType *FTy = FunctionType::get(OutputTy, Tys, false); + Function *ParentFn = Entry->getParent(); + + /// Create function and move blocks. + OutlinedFn = Function::Create(FTy, GlobalValue::PrivateLinkage, "cso", + Entry->getModule()); + OutlinedFn->getBasicBlockList().splice( + OutlinedFn->end(), Entry->getParent()->getBasicBlockList(), + Entry->getIterator()); + OutlinedFn->setUnnamedAddr(GlobalValue::UnnamedAddr::Local); + Entry->getTerminator()->eraseFromParent(); + ReturnInst::Create(Ctx, FoldedOut, Entry); + + // FIXME: Fix this if we can have merged debug info. + if (ParentFn->getSubprogram()) { + for (auto It = Entry->begin(), E = Entry->end(); It != E;) { + Instruction &I = *It; + ++It; + if (isa(I)) + Entry->getInstList().erase(I.getIterator()); + } + } + + // FIXME: Ideally we should compute the real count for this function but + // for now we just tag it as cold. + if (EmitProfileData) + OutlinedFn->addFnAttr(Attribute::Cold); + + /// Create stores for any output variables. + auto OutputArgBegin = OutlinedFn->arg_begin(); + std::advance(OutputArgBegin, Data.InputSeq.size()); + for (size_t OutputIdx : Data.Outputs) { + if (OutputIdx == Data.OutputToFold) + continue; + unsigned ArgNo, GepIdx; + std::tie(ArgNo, GepIdx) = OutputGepIdx[OutputIdx]; + Value *OutputArg = &*std::next(OutputArgBegin, ArgNo); + Instruction *Out = OM.getInstr(StartIdx + OutputIdx); + Instruction *InstPtr = Out->getNextNode(); + if (GepIdx != 0) { + Value *Idx = + ConstantInt::get(Type::getInt32Ty(Out->getContext()), GepIdx); + OutputArg = GetElementPtrInst::CreateInBounds(Out->getType(), OutputArg, + Idx, "", InstPtr); + } + new StoreInst(Out, OutputArg, InstPtr); + } + + /// Replace input operands with function arguments. + auto ArgI = OutlinedFn->arg_begin(); + for (auto &InputPair : Data.InputSeq) { + size_t InputInstIdx, OpNo; + std::tie(InputInstIdx, OpNo) = InputPair; + Instruction *InputInst = OM.getInstr(StartIdx + InputInstIdx); + InputInst->setOperand(OpNo, &*ArgI++); + } + + /// Inherit attributes. + OutlinedFn->addAttributes( + AttributeList::FunctionIndex, + ParentFn->getAttributes().getAttributes(AttributeList::FunctionIndex)); + OutlinedFn->removeAttributes(AttributeList::FunctionIndex, NonInheritAttrs); + OutlinedFn->removeFnAttr(Attribute::AllocSize); + } + // The function created after outlining. + Function *OutlinedFn = nullptr; + // Output relocation information. + DenseMap> OutputTypeCount; + DenseMap> OutputGepIdx; + OutputAllocaCache OAC; + // Set of attributes that are not to be inherited from parent functions. + AttrBuilder NonInheritAttrs; + // If we are emitting profile date during outlining. + bool EmitProfileData; + // The index of the initial occurrence for the current splice. + unsigned InitialStartIdx; +}; + +/// \brief Perform analysis and verification for the found outline candidates. +struct OutlinerAnalysisImpl { + OutlinerAnalysisImpl( + IRInstructionMapper &OM, std::vector &CandidateList, + function_ref GetTTI, + const DataLayout *Layout, + SpecificBumpPtrAllocator &DataAlloc) + : OM(OM), CandidateList(CandidateList), GetTTI(GetTTI), Layout(Layout), + DataAlloc(DataAlloc) {} + + // Analyze our found candidates for benefit and correctness. + void analyzeCandidateList() { + // Generate instruction info for each instruction that is part + // of an occurrence. + BitVector OccurInstrInfo(OM.getNumMappedInstructions()); + for (OutlineCandidate &OC : CandidateList) { + OC.Data = new (DataAlloc.Allocate()) AdditionalCandidateData(); + for (unsigned Occur : OC) + OccurInstrInfo.set(Occur, Occur + OC.Len); + } + for (unsigned Idx : OccurInstrInfo.set_bits()) + OM.createInstrInfo(Idx); + + // Estimate the function type and benefit of each chain. + estimateFunctionType(0, CandidateList.size()); + + // After initial benefit analysis, we verify the occurrences of each + // candidate are compatible. + verifyOccurrenceCompatability(); + } + +private: + // Helper struct for verifying the compatibility of occurrences within a + // candidate. + struct VerifyInst : public std::pair> { + void reset(Function *F) { + first = F; + second.clear(); + } + bool compare(VerifyInst &R, + function_ref GetTTI) { + Function *LF = first, *RF = R.first; + if (LF != RF) { + if (!AttributeFuncs::areInlineCompatible(*LF, *RF)) + return false; + TargetTransformInfo &TTI = GetTTI(*LF); + if (!TTI.areInlineCompatible(LF, RF)) + return false; + } + return second == R.second; + } + }; + + // Verify the compatibilities of the occurrences in each candidate. + void verifyOccurrenceCompatability() { + // Vector of different verification leaders. + std::vector VerificationCands; + // Count of occurrences to an verification instance. + std::vector Counts; + // Maps occurrences to the input index they belong to. + std::vector OccurrenceInputIdx; + // Verification instance of the current occurrence being considered. + VerifyInst CurrentOccurVerifyInst; + + size_t CurrentCandidateListSize = CandidateList.size(); + for (size_t i = 0, e = CurrentCandidateListSize; i < e; ++i) { + OutlineCandidate &OC = CandidateList[i]; + if (!OC.isValid()) + continue; + // Compute the input sequence for this occurrence. + for (size_t Oi = OccurrenceInputIdx.size(), Oe = OC.size(); Oi < Oe; + ++Oi) { + unsigned Occur = OC.getOccurrence(Oi); + CurrentOccurVerifyInst.reset(OM.getInstrFunction(Occur)); + for (size_t InstrIdx = Occur, InstrE = Occur + OC.Len; + InstrIdx < InstrE; ++InstrIdx) { + IRInstructionMapper::InstructionInfo &II = OM.getInstrInfo(InstrIdx); + for (unsigned InIdx : II.InputIndexes) { + // We only really need to verify inputs coming from within the + // sequence. Other inputs simply help to verify the ordering. + unsigned InputIdx = InIdx < Occur ? -1 : InIdx - Occur; + CurrentOccurVerifyInst.second.push_back(InputIdx); + } + } + + // Check for existing mapping of this instance. + auto It = + std::find_if(VerificationCands.begin(), VerificationCands.end(), + [&](VerifyInst &L) { + return L.compare(CurrentOccurVerifyInst, GetTTI); + }); + if (It != VerificationCands.end()) { + size_t InternalInputIt = It - VerificationCands.begin(); + ++Counts[InternalInputIt]; + OccurrenceInputIdx.push_back(InternalInputIt); + } else { + unsigned OrigCapacity = CurrentOccurVerifyInst.second.capacity(); + OccurrenceInputIdx.push_back(VerificationCands.size()); + VerificationCands.emplace_back(std::move(CurrentOccurVerifyInst)); + CurrentOccurVerifyInst.second.reserve(OrigCapacity); + Counts.push_back(1); + } + } + + OutlineCandidate &Cur = CandidateList[i]; + size_t SharedSizeWithNext = Cur.SharedSizeWithNext; + size_t IdxOfNext = i + 1; + while (SharedSizeWithNext > 0 && !CandidateList[IdxOfNext].isValid()) + SharedSizeWithNext = CandidateList[IdxOfNext++].SharedSizeWithNext; + size_t FirstOccur = Cur.getOccurrence(0); + + // Only split if needed. + if (Counts.size() > 1) + splitOutlineChain(i, Counts, OccurrenceInputIdx); + + // If we share a size with the next chain then we do cleanup and set up to + // reduce the amount of work we need to do during the next iteration. + if (SharedSizeWithNext > 0 && CandidateList[IdxOfNext].isValid()) { + // Get the cut off point for moving to the next candidate. + size_t SharedCutOffPoint = 0; + for (size_t InstrIdx = FirstOccur, + InstrE = FirstOccur + SharedSizeWithNext; + InstrIdx < InstrE; ++InstrIdx) { + IRInstructionMapper::InstructionInfo &II = OM.getInstrInfo(InstrIdx); + SharedCutOffPoint += II.InputIndexes.size(); + } + + // Update the size of the internal inputs vectors. + for (size_t InIt = 0, InE = VerificationCands.size(); InIt < InE; + ++InIt) + VerificationCands[InIt].second.resize(SharedCutOffPoint); + + // Don't bother merging if there is only one instance. + if (Counts.size() == 1) + continue; + + // Set resolved occurrences that point to the first vector. + BitVector UnResolved(OccurrenceInputIdx.size(), true); + for (size_t i = 0, e = OccurrenceInputIdx.size(); i < e; ++i) + if (OccurrenceInputIdx[i] == 0) + UnResolved.reset(i); + // Condense the internal inputs vector in the case where two vectors are + // now equivalent given the smaller size. + size_t InsertIdx = 1; + for (size_t i = 1, e = VerificationCands.size(); i < e; ++i) { + VerifyInst &VI = VerificationCands[i]; + auto RemapOccurIdxs = [&](size_t Old, size_t New) { + for (size_t OccurIdx : UnResolved.set_bits()) { + if (OccurrenceInputIdx[OccurIdx] == Old) { + OccurrenceInputIdx[OccurIdx] = New; + UnResolved.reset(OccurIdx); + } + } + }; + // Try to remap to an existing instance. + auto Remap = [&]() -> bool { + for (size_t j = 0; j < InsertIdx; ++j) { + if (VerificationCands[j].compare(VI, GetTTI)) { + Counts[j] += Counts[i]; + RemapOccurIdxs(i, j); + return true; + } + } + return false; + }; + + // Update mapping with new instance. + if (!Remap()) { + if (i != InsertIdx) { + Counts[InsertIdx] = Counts[i]; + RemapOccurIdxs(i, InsertIdx); + VerificationCands[InsertIdx++] = std::move(VerificationCands[i]); + } else + ++InsertIdx; + } + } + VerificationCands.resize(InsertIdx); + Counts.resize(InsertIdx); + continue; + } + // Otherwise we just cleanup what we used. + VerificationCands.clear(); + Counts.clear(); + OccurrenceInputIdx.clear(); + } + estimateFunctionType(CurrentCandidateListSize, CandidateList.size()); + } + + // Split the OutlineChain at index /p CurrentChainIndex into N different + // chains with the provided memberships. + void splitOutlineChain(size_t CurrentChainIndex, + const std::vector &MembershipCounts, + const std::vector &OccurrenceInputIdx) { + OutlineCandidate *OrigChain = &CandidateList[CurrentChainIndex]; + AdditionalCandidateData &OrigData = + OrigChain->getData(); + SmallVector SplitChains(MembershipCounts.size(), -1); + size_t FirstValid = 0; + while (FirstValid < MembershipCounts.size() && + MembershipCounts[FirstValid] < MinOccurrences) + ++FirstValid; + if (FirstValid == MembershipCounts.size()) { + OrigChain->invalidate(); + return; + } + + SplitChains[FirstValid] = CurrentChainIndex; + + // Add new chains for each valid count after the first. + unsigned OrigLen = OrigChain->Len; + for (size_t i = FirstValid + 1, e = MembershipCounts.size(); i < e; ++i) { + size_t Count = MembershipCounts[i]; + if (Count < MinOccurrences) + continue; + SplitChains[i] = CandidateList.size(); + CandidateList.emplace_back(OrigLen); + OutlineCandidate &NewChain = CandidateList.back(); + NewChain.Occurrences.reserve(Count); + NewChain.Data = new (DataAlloc.Allocate()) AdditionalCandidateData(); + AdditionalCandidateData &NCData = + NewChain.getData(); + NCData.InputSeq.reserve(OrigData.InputSeq.capacity()); + } + + // Move occurrences to their new parents. + OrigChain = &CandidateList[CurrentChainIndex]; + for (size_t i = 0, e = OrigChain->size(); i < e; ++i) { + size_t NewParentIdx = SplitChains[OccurrenceInputIdx[i]]; + if (NewParentIdx != CurrentChainIndex && NewParentIdx != size_t(-1)) { + OutlineCandidate &NewParent = CandidateList[NewParentIdx]; + NewParent.Occurrences.push_back(OrigChain->Occurrences[i]); + } + } + bool RecomputeInputs = FirstValid != 0; + for (ssize_t i = OrigChain->size() - 1; i >= 0; --i) + if (OccurrenceInputIdx[i] != FirstValid) + OrigChain->removeOccurrence(i); + + // Update the occurrences and recalculate the analysis for the split + // candidate. + computeInputOutputSequence(*OrigChain, RecomputeInputs); + estimateFunctionType(CurrentChainIndex, CurrentChainIndex + 1); + } + + // Estimate the function type of each outline chain in the given range. + void estimateFunctionType(size_t StartIdx, size_t EndIdx) { + std::vector InputOperands; + BitVector UnFoldableInputs; + for (size_t i = StartIdx; i < EndIdx; ++i) { + OutlineCandidate &OC = CandidateList[i]; + AdditionalCandidateData &OCData = OC.getData(); + unsigned FirstOccur = *OC.begin(); + + // Compute the input sequence if needed. + if (OCData.InputSeq.empty()) { + computeInputOutputSequence(OC); + + // After computing we also compute the input sequences of chains + // that we share size with. We have already calculated the input + // sequence for this chain as it is a subset of our current + // sequence. + OutlineCandidate *Cur = &OC; + while (Cur->SharedSizeWithNext > 0) { + OutlineCandidate *Next = Cur + 1; + AdditionalCandidateData &CurData = + Cur->getData(); + size_t NewSize = 0; + for (size_t e = CurData.InputSeq.size(); NewSize < e; ++NewSize) + if (CurData.InputSeq[NewSize].first >= Cur->SharedSizeWithNext) + break; + AdditionalCandidateData &NextData = + Next->getData(); + NextData.InputSeq.assign(CurData.InputSeq.begin(), + CurData.InputSeq.begin() + NewSize); + computeInputOutputSequence(*Next, false); + Cur = Next; + } + } + + // Check to see if we share candidates with our predecessor. If we + // do then we can avoid rechecking candidates that we already have + // information for. + size_t SharedOccurrencesWithPrev = 1; + if (i > StartIdx) { + OutlineCandidate &Prev = CandidateList[i - 1]; + if (Prev.SharedSizeWithNext > 0) + SharedOccurrencesWithPrev = Prev.size(); + } + + // Only recompute the input operands if we didn't share a size with the + // previous chain. + if (SharedOccurrencesWithPrev == 1) { + // Get the operands for the first candidate. + InputOperands.clear(); + UnFoldableInputs.reset(); + UnFoldableInputs.resize(OCData.InputSeq.size()); + for (size_t i = 0, e = OCData.InputSeq.size(); i < e; ++i) { + auto &Seq = OCData.InputSeq[i]; + Value *Op = + OM.getInstr(FirstOccur + Seq.first)->getOperand(Seq.second); + InputOperands.push_back(Op); + if (isa(Op) || isa(Op)) + UnFoldableInputs.set(i); + } + } else { + InputOperands.resize(OCData.InputSeq.size(), nullptr); + UnFoldableInputs.resize(InputOperands.size()); + } + + // Check to see which inputs won't be folded. + for (size_t Ci = SharedOccurrencesWithPrev, Ce = OC.size(); Ci != Ce; + ++Ci) { + unsigned Occur = OC.getOccurrence(Ci); + size_t InputNo = 0; + for (auto &Seq : OCData.InputSeq) { + if (UnFoldableInputs.test(InputNo)) { + ++InputNo; + continue; + } + Value *InputOp = + OM.getInstr(Occur + Seq.first)->getOperand(Seq.second); + if (InputOp != InputOperands[InputNo]) + UnFoldableInputs.set(InputNo); + ++InputNo; + } + } + + // Remove all of the inputs that will be folded. + for (size_t i = 0, e = OCData.InputSeq.size(), InputNo = e - 1; i < e; + ++i, --InputNo) { + if (!UnFoldableInputs.test(InputNo)) + OCData.InputSeq.erase(OCData.InputSeq.begin() + InputNo); + } + } + computeOutlineBenefit(CandidateList, StartIdx, EndIdx); + } + + // Compute the external input sequence(Inst#+Op#) and outputs of a given + // chain. + void computeInputOutputSequence(OutlineCandidate &OC, + bool ComputeInputs = true) { + AdditionalCandidateData &Data = OC.getData(); + // Inputs are operands that come from outside of the chain range. + if (ComputeInputs) { + Data.InputSeq.clear(); + Data.InputSeq.reserve(OC.Len); + unsigned FirstOccur = OC.getOccurrence(0); + for (size_t InstrIdx = FirstOccur, InstrE = FirstOccur + OC.Len; + InstrIdx < InstrE; ++InstrIdx) { + IRInstructionMapper::InstructionInfo &II = OM.getInstrInfo(InstrIdx); + for (size_t i = 0, e = II.InputIndexes.size(); i < e; ++i) + if (II.InputIndexes[i] < FirstOccur) + Data.InputSeq.emplace_back(InstrIdx - FirstOccur, i); + } + } + + // Outputs are internal instructions that have uses outside of the chain + // range. + Data.Outputs.clear(); + for (unsigned Occur : OC) { + for (size_t InstrIdx = Occur, InstrE = Occur + OC.Len; InstrIdx < InstrE; + ++InstrIdx) { + IRInstructionMapper::InstructionInfo &II = OM.getInstrInfo(InstrIdx); + if (II.FarthestInSameBlockOutput >= InstrE) + Data.Outputs.set(InstrIdx - Occur); + } + } + } + + // Computes the estimated benefit of a set of potential functions. + void computeOutlineBenefit(std::vector &CandidateList, + size_t StartIdx, size_t EndIdx) { + SmallDenseSet UniqueInputOperands; + SmallPtrSet OutputTypes; + for (size_t i = StartIdx; i < EndIdx; ++i) { + OutlineCandidate &OC = CandidateList[i]; + AdditionalCandidateData &Data = OC.getData(); + + /// Reset benefit metrics. + OC.invalidate(); + + // Sanity check. + unsigned NumOccurences = OC.size(); + if (NumOccurences < MinOccurrences) + continue; + + // Estimate the cost of this chain of instructions. + /// For now we just use the first candidate. + unsigned FirstOccur = OC.getOccurrence(0); + Function *FirstOccurFn = OM.getInstrFunction(FirstOccur); + TargetTransformInfo &TTI = GetTTI(*FirstOccurFn); + unsigned ChainCost = 0; + for (size_t i = 0, e = OC.Len, InstIdx = FirstOccur; i < e; + ++i, ++InstIdx) + ChainCost += OM.getInstrCost(TTI, InstIdx); + + unsigned WidestRegister = TTI.getRegisterBitWidth(false); + auto EstimateIrregularTypeParamCost = [&](Type *Ty) { + return Layout->getTypeSizeInBits(Ty) / WidestRegister; + }; + + /// Estimate inputs to this function. + UniqueInputOperands.clear(); + for (auto &Seq : Data.InputSeq) { + Value *IOp = + OM.getInstr(FirstOccur + Seq.first)->getOperand(Seq.second); + UniqueInputOperands.insert(IOp); + } + + unsigned NumFunctionParams = UniqueInputOperands.size(); + unsigned NumOutputs = Data.Outputs.count(); + /// We penalize container parameters. + for (Value *Op : UniqueInputOperands) { + Type *OpTy = Op->getType(); + if (OpTy->isVectorTy() || OpTy->isStructTy()) + NumFunctionParams += EstimateIrregularTypeParamCost(OpTy); + } + + // Compute the output to fold to a return as well as the total number of + // outputs passed to the function. + if (NumOutputs == 1) { + Data.OutputToFold = Data.Outputs.find_first(); + } else if (NumOutputs > 0) { + // Outputs are condensed by type, so the number of actual parameters + // going to the outlined function is the number of distinct types + // for the outputs. We try to reduce this by folding an output that + // is the single member of its type. + OutputTypes.clear(); + SmallDenseMap BeneficialToFold; + for (size_t OutputIdx : Data.Outputs) { + Type *ParamEleTy = OM.getInstr(FirstOccur + OutputIdx)->getType(); + if (OutputTypes.insert(ParamEleTy).second) + BeneficialToFold.try_emplace(ParamEleTy, OutputIdx); + else + BeneficialToFold.erase(ParamEleTy); + } + NumFunctionParams += OutputTypes.size(); + if (!BeneficialToFold.empty()) { + Data.OutputToFold = 0; + for (auto &TyPair : BeneficialToFold) + if (Data.OutputToFold < TyPair.second) + Data.OutputToFold = TyPair.second; + --NumFunctionParams; + } else + Data.OutputToFold = Data.Outputs.find_last(); + } else + Data.OutputToFold = -1; + + /// The new function contains one instance of the chain of instructions, + /// has to prepare each parameter, and contains a return. + unsigned NewFunctionCost = + std::min(NumFunctionParams, TTI.getNumberOfRegisters(false)) + + ChainCost + 1; + unsigned CostFromReLoad = 0; + + /// Add the cost for each output. + OutputTypes.clear(); + for (size_t OutputIdx : Data.Outputs) { + Type *ParamEleTy = OM.getInstr(FirstOccur + OutputIdx)->getType(); + Type *ParamTy = ParamEleTy->getPointerTo(); + + // Extremely basic cost estimate for loading and storing a struct type. + // FIXME: This needs to be improved. + if (ParamEleTy->isStructTy()) { + unsigned EstCost = EstimateIrregularTypeParamCost(ParamEleTy); + NewFunctionCost += EstCost; + CostFromReLoad += EstCost; + continue; + } + size_t StoreSize = Layout->getTypeStoreSize(ParamTy); + size_t AddrSpace = ParamTy->getPointerAddressSpace(); + + /// There will be a store into this variable in the outlined function. + NewFunctionCost += TTI.getMemoryOpCost(Instruction::Store, ParamEleTy, + StoreSize, AddrSpace); + /// Account for a potential add to the base output pointer. + if (OutputIdx != Data.OutputToFold) + if (!OutputTypes.insert(ParamEleTy).second) + NewFunctionCost += 1; + + /// Each output value has a reload in the parent function. + /// NOTE: This isn't entirely true if a specific instance doesn't use + /// the value. + CostFromReLoad += TTI.getMemoryOpCost(Instruction::Load, ParamEleTy, + StoreSize, AddrSpace); + } + + /// A call is generated at each occurence. + /// = call instruction + prepare each parameter + reload outputs. + unsigned CostPerOccurence = 1 + NumFunctionParams + CostFromReLoad; + + // No possibility of benefit. + if (CostPerOccurence >= ChainCost) + continue; + + // Compute the benefit of the chain and each occurrence. + OC.BenefitPerOccur = ChainCost - CostPerOccurence; + unsigned OutlineBenefit = OC.BenefitPerOccur * NumOccurences; + if (OutlineBenefit > NewFunctionCost) { + OC.Benefit = OutlineBenefit - NewFunctionCost; + if(OC.Benefit < MinBenefit) + OC.invalidate(); + } + + DEBUG(dbgs() << "Num : " << NumOccurences << "; Len : " << OC.Len + << "\n"); + DEBUG(dbgs() << "Inputs : " << Data.InputSeq.size() + << "; Outputs : " << NumOutputs << "\n"); + DEBUG(dbgs() << "Chain Cost : " << ChainCost << "\n"); + DEBUG(dbgs() << "CostPerOccur : " << CostPerOccurence << "\n"); + DEBUG(dbgs() << "BenefitPerOccur : " << OC.BenefitPerOccur << "\n"); + DEBUG(dbgs() << "NewFunctionCost : " << NewFunctionCost << "\n\n"); + } + } + + // Internal members + IRInstructionMapper &OM; + std::vector &CandidateList; + function_ref GetTTI; + const DataLayout *Layout; + SpecificBumpPtrAllocator &DataAlloc; +}; + +/// \brief Outliner for sequenced IR instructions. +class SequencedIROutliner : public SequencedOutlinerImpl { +public: + SequencedIROutliner(OutlinerMapper &OM, bool EmitProfileData, + function_ref GetTTI, + Module &M) + : SequencedOutlinerImpl(OM, ::MinInstructionLength, ::MinOccurrences, + SequencedOutlinerImpl::CSM_SuffixArray), + GetTTI(GetTTI), DL(&M.getDataLayout()), FS(EmitProfileData), + EmitRemarks(areRemarksEnabled(M)) {} + virtual ~SequencedIROutliner() {} + virtual void + analyzeCandidateList(std::vector &CL) override { + IRInstructionMapper &MIM = getMapperAs(); + OutlinerAnalysisImpl OAI(MIM, CL, GetTTI, DL, DataAlloc); + OAI.analyzeCandidateList(); + } + virtual void outlineCandidate(OutlineCandidate &OC, size_t CandNum) override { + IRInstructionMapper &IM = getMapperAs(); + FS.prepareForNewInstance(OC, IM); + for (unsigned i = 0, e = OC.size(); i < e; ++i) { + unsigned Occur = OC.getOccurrence(i); + if (EmitRemarks) + emitOptRemark(OC, CandNum, i, Occur); + FS.outlineOccurrence(OC, Occur, IM); + } + FS.finalize(OC); + +#ifndef NDEBUG + Function *OF = IM.getInstrFunction(OC.getOccurrence(0)); + DEBUG(dbgs() << "** Outlining : " << OF->getName() << "\n" + << " occurrences : " << OC.size() << "\n" + << " size : " << OC.Len << "\n" + << " benefit : " << OC.Benefit << "\n"); +#endif + ++NumCandidatesOutlined; + } + + // \brief Emit a remark about a code block that was outlined. + void emitOptRemark(OutlineCandidate &OC, unsigned CandidateIdx, + unsigned ValidOccurNum, unsigned Occur) { + IRInstructionMapper &IM = getMapperAs(); + Instruction *Head = IM.getInstr(Occur); + Instruction *Tail = IM.getInstr(Occur + OC.Len - 1); + Function *ParentFn = Head->getFunction(); + OptimizationRemarkEmitter ORE(ParentFn); + + std::ostringstream NameStream, AdditionalInfo; + NameStream << "Outlining Candidate " << CandidateIdx << ": Occurrence " + << ValidOccurNum; + AdditionalInfo << ": " << OC.Len << " instructions : " + << " Estimated benefit=" << OC.Benefit; + + std::string Msg = NameStream.str(); + std::string Additional = AdditionalInfo.str(); + if (Tail->getDebugLoc() != Head->getDebugLoc()) { + ore::NV BeginLoc("Begin location", Head); + BeginLoc.Val = "Begin location"; + ore::NV TailLoc("End location", Tail); + TailLoc.Val = "End location"; + ORE.emit(OptimizationRemark(DEBUG_TYPE, "Outlined", Head) + << Msg << " Begin " << Additional << " : " << TailLoc); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "Outlined", Tail) + << Msg << " End : " << BeginLoc); + } else + ORE.emit(OptimizationRemark(DEBUG_TYPE, "Outlined", Head) + << Msg << Additional); + } + + // Emitting optimization remarks creates a compile hit due to the string + // manipulation. Check if its enabled so that we can avoid it. + bool areRemarksEnabled(Module &M) { + bool IsEnabled = OptimizationRemark::isEnabled(DEBUG_TYPE); + if (!IsEnabled && !M.empty()) { + OptimizationRemarkEmitter ORE(&M.getFunctionList().front()); + IsEnabled = ORE.allowExtraAnalysis(); + } + return IsEnabled; + } + SpecificBumpPtrAllocator DataAlloc; + function_ref GetTTI; + const DataLayout *DL; + FunctionSplicer FS; + bool EmitRemarks; +}; + +// Run the outliner over the provided module /p M. +bool runImpl(Module &M, ProfileSummaryInfo *PSI, + function_ref GetBFI, + function_ref GetTTI) { + if (!EnableOutliner) + return false; + IRInstructionMapper OM; + OM.mapModule(M, PSI, GetBFI); + + // No potential candidates. + if (OM.getNumMappedInstructions() < MinOccurrences * MinInstructionLength) + return false; + + // Logic for outlining an individual candidate. + bool EmitProfileData = PSI->hasProfileSummary(); + SequencedIROutliner SOI(OM, EmitProfileData, GetTTI, M); + return SOI.run(); +} +} // namespace + +PreservedAnalyses CodeSizeOutlinerPass::run(Module &M, + ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult(M).getManager(); + std::function GetTTI = + [&](Function &F) -> TargetTransformInfo & { + return FAM.getResult(F); + }; + std::function GetBFI = + [&](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult(F); + }; + ProfileSummaryInfo *PSI = &AM.getResult(M); + return runImpl(M, PSI, GetBFI, GetTTI) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); +} + +struct CodeSizeOutlinerLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + CodeSizeOutlinerLegacyPass() : ModulePass(ID) { + initializeCodeSizeOutlinerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + TargetTransformInfoWrapperPass *TTIWP = + &getAnalysis(); + DenseMap TTIMap; + std::function GetTTI = + [&](Function &F) -> TargetTransformInfo & { + auto TTIIt = TTIMap.find(&F); + if (TTIIt == TTIMap.end()) + TTIIt = TTIMap.try_emplace(&F, std::move(TTIWP->getTTI(F))).first; + return TTIIt->second; + }; + std::unique_ptr CurBFI; + std::function GetBFI = + [&](Function &F) -> BlockFrequencyInfo & { + DominatorTree DT(F); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(F, LI); + CurBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); + return *CurBFI.get(); + }; + ProfileSummaryInfo *PSI = + getAnalysis().getPSI(); + return runImpl(M, PSI, GetBFI, GetTTI); + } +}; +char CodeSizeOutlinerLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(CodeSizeOutlinerLegacyPass, DEBUG_TYPE, + "Code Size Outliner", false, false) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(CodeSizeOutlinerLegacyPass, DEBUG_TYPE, + "Code Size Outliner", false, false) +ModulePass *llvm::createCodeSizeOutlinerPass() { + return new CodeSizeOutlinerLegacyPass(); +} Index: lib/Transforms/IPO/IPO.cpp =================================================================== --- lib/Transforms/IPO/IPO.cpp +++ lib/Transforms/IPO/IPO.cpp @@ -25,6 +25,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeArgPromotionPass(Registry); + initializeCodeSizeOutlinerLegacyPassPass(Registry); initializeConstantMergeLegacyPassPass(Registry); initializeCrossDSOCFIPass(Registry); initializeDAEPass(Registry); Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -155,6 +155,10 @@ "enable-gvn-sink", cl::init(false), cl::Hidden, cl::desc("Enable the GVN sinking pass (default = off)")); +static cl::opt EnableEarlyCSO( + "enable-early-cso", cl::init(false), cl::Hidden, + cl::desc("Enable an early run of the code size outliner pass (default = off)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -490,6 +494,10 @@ if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) addPGOInstrPasses(MPM); + // Add an early run of the code size outliner pass. + if (EnableEarlyCSO && SizeLevel > 0) + MPM.add(createCodeSizeOutlinerPass()); + if (EnableNonLTOGlobalsModRef) // We add a module alias analysis pass here. In part due to bugs in the // analysis infrastructure this "works" in that the analysis stays alive @@ -660,6 +668,10 @@ MPM.add(createGlobalDCEPass()); // Remove dead fns and globals. MPM.add(createConstantMergePass()); // Merge dup global constants } + + // Add a late run of the code size outliner pass. + if(SizeLevel > 0) + MPM.add(createCodeSizeOutlinerPass()); } if (MergeFunctions) @@ -823,6 +835,10 @@ void PassManagerBuilder::addLateLTOOptimizationPasses( legacy::PassManagerBase &PM) { + // Add a late run of the code size outliner pass. + if(SizeLevel > 0) + PM.add(createCodeSizeOutlinerPass()); + // Delete basic blocks, which optimization passes may have killed. PM.add(createCFGSimplificationPass());