Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -100,6 +100,7 @@
 void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
 void initializeCalledValuePropagationLegacyPassPass(PassRegistry &);
 void initializeCodeGenPreparePass(PassRegistry&);
+void initializeCodeSizeOutlinerLegacyPassPass(PassRegistry&);
 void initializeConstantHoistingLegacyPassPass(PassRegistry&);
 void initializeConstantMergeLegacyPassPass(PassRegistry&);
 void initializeConstantPropagationPass(PassRegistry&);
Index: include/llvm/Transforms/IPO.h
===================================================================
--- include/llvm/Transforms/IPO.h
+++ include/llvm/Transforms/IPO.h
@@ -220,6 +220,12 @@
 /// indicating the set of functions they may target at run-time.
 ModulePass *createCalledValuePropagationPass();
 
+//===----------------------------------------------------------------------===//
+/// createCodeSizeOutlinerPass - This pass outlines congruent parts of
+/// functions.
+///
+ModulePass *createCodeSizeOutlinerPass();
+
 /// What to do with the summary when running passes that operate on it.
 enum class PassSummaryAction {
   None,   ///< Do nothing.
Index: include/llvm/Transforms/IPO/CodeSizeOutliner.h
===================================================================
--- /dev/null
+++ include/llvm/Transforms/IPO/CodeSizeOutliner.h
@@ -0,0 +1,31 @@
+//===-- CodeSizeOutliner.h - Outline congruent instruction chains
+//------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform outlines congruent chains of instructions from the current
+// module.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_CODESIZEOUTLINER_H
+#define LLVM_TRANSFORMS_IPO_CODESIZEOUTLINER_H
+
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+/// Pass to outline congruent chains of instructions to reduce code size.
+class CodeSizeOutlinerPass : public PassInfoMixin<CodeSizeOutlinerPass> {
+public:
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_IPO_CODESIZEOUTLINER_H
Index: include/llvm/Transforms/Utils/Outliner.h
===================================================================
--- /dev/null
+++ include/llvm/Transforms/Utils/Outliner.h
@@ -0,0 +1,125 @@
+//===- Outliner.h - A generic outlining utility interface around the Utils lib
+//------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes utilities for defining outliner functionality.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_OUTLINER_H
+#define LLVM_TRANSFORMS_UTILS_OUTLINER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/IR/Instruction.h"
+#include <vector>
+
+namespace llvm {
+class BitVector;
+class BlockFrequencyInfo;
+class ProfileSummaryInfo;
+
+/// \brief A potential candidate to be outlined.
+struct OutlineCandidate {
+  /// A unique ID for this outline candidate.
+  unsigned ID;
+
+  /// The amount of instructions being saved.
+  unsigned Len;
+
+  /// The computed benefit of outlining this candidate.
+  unsigned Benefit = 0;
+
+  /// The estimated benefit we receive per occurrence.
+  unsigned BenefitPerOccur = 0;
+
+  /// Instruction size that this candidate shares with the next.
+  unsigned SharedSizeWithNext = 0;
+
+  /// Identifier for each occurrence.
+  std::vector<unsigned> Occurrences;
+
+  // Accessors.
+  using Iterator = std::vector<unsigned>::iterator;
+  using ConstIterator = std::vector<unsigned>::const_iterator;
+  Iterator begin() { return Occurrences.begin(); }
+  Iterator end() { return Occurrences.end(); }
+  ConstIterator begin() const { return Occurrences.begin(); }
+  ConstIterator end() const { return Occurrences.end(); }
+  size_t size() const { return Occurrences.size(); }
+
+  // Check to see if this chain is still profitable to outline.
+  bool isValid() const { return Benefit != 0; }
+  // Set this candidate as not profitable.
+  void invalidate() { Benefit = 0; }
+  // Get the candidate at index /p Idx.
+  unsigned getOccurrence(size_t Idx) const {
+    assert(Idx < size() && "Invalid occurrence index.");
+    return Occurrences[Idx];
+  }
+  // Remove the occurrence at index /p Idx
+  void removeOccurrence(size_t Idx) {
+    Occurrences[Idx] = Occurrences.back();
+    Occurrences.pop_back();
+  }
+
+  OutlineCandidate(unsigned ID, unsigned Len,
+                   std::vector<unsigned> &Occurrences)
+      : ID(ID), Len(Len), Occurrences(Occurrences) {}
+  OutlineCandidate(unsigned ID, unsigned Len) : ID(ID), Len(Len) {}
+};
+
+std::vector<OutlineCandidate> findSequentialOutliningCandidates(
+    function_ref<bool(ArrayRef<unsigned>, unsigned)> PrePruneFn,
+    std::vector<unsigned> &Vec, unsigned MinInstructionLen,
+    unsigned MinOccurrences);
+
+bool pruneSequentialOutlineCandidateList(MutableArrayRef<OutlineCandidate> CL,
+                                         unsigned NumMappedInstructions);
+
+/// \brief Helper struct containing mapping information for a module.
+class OutlinerMapper {
+public:
+  // Get the instruction at index /p Idx.
+  template <typename InstrTy> InstrTy *getInstr(unsigned Idx) {
+    return (InstrTy *)InstrVec[Idx];
+  }
+
+  // Map a new instruction.
+  void mapInstr(void *I) {
+    if (I)
+      InstructionToIdxMap.try_emplace(I, InstrVec.size());
+    InstrVec.push_back(I);
+  }
+
+  // Get the index of /p I inside of the internal vector.
+  unsigned getInstrIdx(void *I) {
+    auto It = InstructionToIdxMap.find(I);
+    return LLVM_UNLIKELY(It == InstructionToIdxMap.end()) ?
+        -1 : It->second;
+  }
+
+  // Get the number of mapped instructions.
+  unsigned getNumMappedInstructions() const { return InstrVec.size(); }
+
+private:
+  /// Stores location of instructions mapped to the corresponding index in
+  /// the CCVec.
+  std::vector<void *> InstrVec;
+
+  /// Map of each mapped instruction to its index within InstrVec.
+  DenseMap<void *, unsigned> InstructionToIdxMap;
+};
+
+// Map the module /p M to prepare for outlining.
+std::vector<unsigned>
+mapIRModule(OutlinerMapper &OM, Module &M, ProfileSummaryInfo *PSI,
+            function_ref<BlockFrequencyInfo &(Function &)> GetBFI);
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_OUTLINER_H
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -66,6 +66,7 @@
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
 #include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/Transforms/IPO/CalledValuePropagation.h"
+#include "llvm/Transforms/IPO/CodeSizeOutliner.h"
 #include "llvm/Transforms/IPO/ConstantMerge.h"
 #include "llvm/Transforms/IPO/CrossDSOCFI.h"
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
@@ -193,6 +194,10 @@
     cl::desc("Run synthetic function entry count generation "
              "pass"));
 
+static cl::opt<bool> EnableEarlyCSO(
+    "enable-npm-early-cso", cl::init(false), cl::Hidden,
+    cl::desc("Enable an early run of the code size outlining pass for the new PM (default = off)"));
+
 static Regex DefaultAliasRegex(
     "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$");
@@ -666,6 +671,10 @@
   if (EnableSyntheticCounts && !PGOOpt)
     MPM.addPass(SyntheticCountsPropagation());
 
+  // Add an early code size outlining pass.
+  if (EnableEarlyCSO && isOptimizingForSize(Level))
+    MPM.addPass(CodeSizeOutlinerPass());
+
   // Require the GlobalsAA analysis for the module so we can query it within
   // the CGSCC pipeline.
   MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
@@ -867,6 +876,10 @@
   MPM.addPass(CGProfilePass());
 
+  // Add the late outlining pass.
+  if (isOptimizingForSize(Level))
+    MPM.addPass(CodeSizeOutlinerPass());
+
   // Now we need to do some global optimization transforms.
   // FIXME: It would seem like these should come first in the optimization
   // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
Index: lib/Passes/PassRegistry.def
===================================================================
--- lib/Passes/PassRegistry.def
+++ lib/Passes/PassRegistry.def
@@ -43,6 +43,7 @@
 MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
 MODULE_PASS("cg-profile", CGProfilePass())
+MODULE_PASS("codesizeoutliner", CodeSizeOutlinerPass())
 MODULE_PASS("constmerge", ConstantMergePass())
 MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass())
 MODULE_PASS("deadargelim", DeadArgumentEliminationPass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
Index: lib/Transforms/IPO/CMakeLists.txt
===================================================================
--- lib/Transforms/IPO/CMakeLists.txt
+++ lib/Transforms/IPO/CMakeLists.txt
@@ -5,6 +5,7 @@
   BlockExtractor.cpp
   CalledValuePropagation.cpp
+  CodeSizeOutliner.cpp
   ConstantMerge.cpp
   CrossDSOCFI.cpp
   DeadArgumentElimination.cpp
   ElimAvailExtern.cpp
Index: lib/Transforms/IPO/CodeSizeOutliner.cpp
===================================================================
--- /dev/null
+++ lib/Transforms/IPO/CodeSizeOutliner.cpp
@@ -0,0 +1,2248 @@
+//===-- CodeSizeOutliner.cpp - Outline congruent instruction chains --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple algorithm for outlining congruent chains of
+// instructions from the current module.
+// +//===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/CodeSizeOutliner.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/IntervalMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/Outliner.h" +#include + +#ifndef DEBUG +#define DEBUG(a) +#endif + +using namespace llvm; + +static cl::opt MinOccurrences( + "cso-min-occurrences", cl::init(2), cl::Hidden, + cl::desc( + "Min number of occurrences to consider a candidate for outlining.")); +static cl::opt MinInstructionLength( + "cso-min-instructions", cl::init(1), cl::Hidden, + cl::desc( + "Min number of instructions to consider a candidate for outlining.")); +static cl::opt + MinBenefit("cso-min-benefit", cl::init(1), cl::Hidden, + cl::desc("Min estimated benefit to be considered profitable.")); + +#define DEBUG_TYPE "codesizeoutliner" +STATISTIC(NumOccurrencesOutlined, "Number of occurrences outlined"); +STATISTIC(NumCandidatesOutlined, "Number of outlined functions created"); + +namespace { +/// \brief A struct representing an input into an outlined function. +struct Input { + Input(unsigned short InstrNo, unsigned short OpNo) + : InstrNo(InstrNo), OpNo(OpNo) {} + Input() {} + /// The index of the instruction in the sequence. 
+ unsigned short InstrNo; + /// The index of the operand into the instruction. + unsigned short OpNo; +}; + +/// \brief Contains the information necessary for condensing constant inputs. +// Condense any constant int inputs to the outlined function. +// example: +// void outlinedFn(int a, int b); +// call 1: outlinedFn(1, 2); +// call 2: outlinedFn(3, 4); +// We identify that in all occurrences that arg 2 is simply +// a difference of 1 from arg 1. +// void outlinedFn(int a) { +// int b = a + 1; +// ... +// } +struct ConstantCondenseInstance { + /// The base input. + Input BaseInput; + + /// The difference from the base input. + std::vector Diffs; + + /// The input/op numbers for the condensed arguments. + std::vector InputSeqs; + + /// The cost of this instance. + unsigned Cost; + + ConstantCondenseInstance(Input BaseInput, unsigned Reserve) + : BaseInput(BaseInput) { + Diffs.reserve(Reserve); + InputSeqs.reserve(Reserve); + } +}; + +/// \brief Information necessary for outlining specific to IR outlining. +struct AdditionalCandidateData { + /// Inputs into this candidate : Vector. + std::vector InputSeq; + + /// Outputs from this candidate. + SparseBitVector<> Outputs; + + /// The constant condense instances. + std::vector ConstInsts; + + /// The number of registers used in a call to the candidate. + unsigned NumCallRegisters = 0; +}; + +/// \brief Holds information about a particular instruction. +struct InstructionInfo { + /// The inputs/operands going into this instruction. + SmallVector InputIndexes; + + /// The index of the farthest use of this instruction in the same block + /// parent. + unsigned FarthestInSameBlockOutput = 0; + + /// The size cost of this instruction. + unsigned Cost = -1; +}; +} // namespace + +namespace { + +// Helper function for getting the size in bits of a type +// in a multiple of /p WidestRegister. 
+unsigned getTypeSize(const DataLayout &DL, Type *Ty, unsigned WidestRegister) { + if (Ty->isTokenTy()) + return 0u; + if (Ty->isPointerTy()) + return WidestRegister; + unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType()); + unsigned RegisterRemainder = TypeSize % WidestRegister; + if (RegisterRemainder > 0) + TypeSize += WidestRegister - RegisterRemainder; + return TypeSize; +} + +// Helper function for getting the register usage of type /p Ty. +unsigned getRegisterUsage(const DataLayout &DL, Type *Ty, + unsigned WidestRegister) { + unsigned ret = 0; + // getTypeSize calls getTypeSizeInBits which expects that + // the type size is non zero. So check.. + if (Ty != nullptr && Ty->isSized()) + ret = getTypeSize(DL, Ty, WidestRegister) / WidestRegister; + return ret; +} + +// Helper to get the cost of a constant. +unsigned getGlobalValueCost(const Constant *C) { + if (isa(C)) + return TargetTransformInfo::TCC_Free; + if (isa(C)) + return TargetTransformInfo::TCC_Basic; + const ConstantExpr *CE = dyn_cast(C); + if (!CE) + return TargetTransformInfo::TCC_Free; + if (CE->getOpcode() == Instruction::GetElementPtr) { + unsigned Cost = TargetTransformInfo::TCC_Basic; + Cost += getGlobalValueCost(CE->getOperand(0)); + for (unsigned i = 1, e = CE->getNumOperands(); i < e; ++i) { + const Constant *COp = CE->getOperand(i); + if (const ConstantInt *CI = dyn_cast(COp)) { + if (!CI->isZero()) + ++Cost; + } else + ++Cost; + } + return Cost; + } + if (CE->isCast()) + return getGlobalValueCost(CE->getOperand(0)); + return TargetTransformInfo::TCC_Free; +} + +/// \brief Helper struct containing mapping information for a module. +class IROutlinerMapper : public OutlinerMapper { +public: + /// + /// Instruction Information Utilities. + /// + + // Get the instruction at index /p Idx. + Instruction *getInstr(unsigned Idx) { + return OutlinerMapper::getInstr(Idx); + } + // Get the operand /p OpIdx of the instruction at index /p Idx. 
+ Value *getInstrOp(unsigned Idx, unsigned OpIdx) { + return getInstr(Idx)->getOperand(OpIdx); + } + // Get the parent function of the instruction at /p Idx. + Function *getInstrFunction(size_t Idx) { + return getInstr(Idx)->getFunction(); + } + // Get or compute the cost of the instruction at /p InstrIdx. + unsigned getInstrCost(TargetTransformInfo &TTI, size_t InstrIdx) { + InstructionInfo *Info = InstrInfo[InstrIdx]; + if (Info->Cost == unsigned(-1)) { + Info->Cost = computeInstrCost(TTI, InstrIdx); +#ifndef NDEBUG + Instruction *I = getInstr(InstrIdx); + DEBUG(dbgs() << "Instruction Cost : " << Info->Cost << " : " << *I + << "\n"); +#endif + } + return Info->Cost; + } + // Get the instruction info attached to index /p InstrIdx + InstructionInfo &getInstrInfo(size_t InstrIdx) { + InstructionInfo *Info = InstrInfo[InstrIdx]; + assert(Info && "Queried instruction has no info created."); + return *Info; + } + // Create instruction info for the instruction at /p InstrIdx + void createInstrInfo(size_t InstrIdx) { + InstructionInfo *&Info = InstrInfo[InstrIdx]; + assert(!Info && "Instruction info already generated."); + Info = new (InfoAllocator.Allocate()) InstructionInfo(); + Instruction *Inst = getInstr(InstrIdx); + BasicBlock *InstPar = Inst->getParent(); + + /// Inputs. + unsigned NumOperands = Inst->getNumOperands(); + Info->InputIndexes.reserve(NumOperands); + for (unsigned InIt = 0; InIt < NumOperands; ++InIt) { + unsigned IIdx = 0; + Value *Op = Inst->getOperand(InIt); + Instruction *IOp = dyn_cast(Op); + if (IOp && IOp->getParent() == InstPar) { + unsigned FoundIIdx = getInstrIdx(IOp); + if (FoundIIdx <= InstrIdx) + IIdx = FoundIIdx; + } else if (isa(Op)) + break; + Info->InputIndexes.emplace_back(IIdx); + } + + /// Outputs. 
+ for (User *Usr : Inst->users()) { + Instruction *I = dyn_cast(Usr); + if (!I || I->getParent() != InstPar) { + Info->FarthestInSameBlockOutput = -1; + break; + } + unsigned IIdx = getInstrIdx(I); + if (IIdx > Info->FarthestInSameBlockOutput) + Info->FarthestInSameBlockOutput = IIdx; + } + } + + /// + /// Candidate Information Utilities. + /// + + // Get the data attached to the provided candidate. + AdditionalCandidateData &getCandidateData(const OutlineCandidate &OC) { + return *OutlineCandData[OC.ID]; + } + // Create candidate data. + void createCandidateData(size_t Count) { + size_t NewSize = OutlineCandData.size() + Count; + OutlineCandData.reserve(NewSize); + for (size_t i = 0; i < Count; ++i) + OutlineCandData.emplace_back(new (DataAlloc.Allocate()) + AdditionalCandidateData()); + } + + void initAdditionalData() { + InstrInfo.assign(getNumMappedInstructions(), nullptr); + } + +private: + // Compute the cost of an instruction. + unsigned computeInstrCost(TargetTransformInfo &TTI, unsigned InstrIdx) { + Instruction *I = getInstr(InstrIdx); + const DataLayout &Layout = I->getModule()->getDataLayout(); + bool HasFreeAddressComp = TTI.getAddressComputationCost(I->getType()) == 0; + unsigned Cost = 0; + + // Estimate the cost of each instruction. 
+ switch (I->getOpcode()) { + case Instruction::Add: + if (any_of(I->operand_values(), + [](Value *V) { return isa(V); })) + Cost = TargetTransformInfo::TCC_Free; + else + Cost = TargetTransformInfo::TCC_Basic; + break; + case Instruction::FDiv: + Cost = TargetTransformInfo::TCC_Basic; + break; + case Instruction::FRem: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::UDiv: + case Instruction::URem: + Cost = TargetTransformInfo::TCC_Basic * 2; + break; + case Instruction::Store: { + StoreInst *SI = cast(I); + Type *SITy = SI->getValueOperand()->getType(); + Cost = getRegisterUsage(Layout, SITy, TTI.getRegisterBitWidth(false)); + break; + } + case Instruction::Load: { + LoadInst *LI = cast(I); + Type *LITy = LI->getType(); + Cost = getRegisterUsage(Layout, LITy, TTI.getRegisterBitWidth(false)); + + // Be conservative about the cost of loads given they may be folded. + // Estimating a lower cost helps to prevent over estimating the + // benefit of this instruction. + Value *Ptr = LI->getPointerOperand(); + auto WillLoadFold = [&]() -> bool { + // We likely can't fold from a phi node. + if (isa(Ptr)) + return false; + // Floating point values will likely need to be materialized. + if (LITy->isFloatingPointTy() && !isa(Ptr)) + return false; + // If the address comp is free then any globals won't affect the cost. + if (HasFreeAddressComp) + return true; + // Non Free address comp will likely need to load globals or values + // used more than once. + return !isa(Ptr) && I->hasOneUse(); + }; + if (WillLoadFold()) + --Cost; + break; + } + case Instruction::Invoke: + case Instruction::Call: + Cost = computeCallCost(I, TTI); + break; + case Instruction::GetElementPtr: { + GetElementPtrInst *GEP = dyn_cast(I); + Cost = TTI.getUserCost(GEP); + + // Be conservative about non free global geps that appear more than once + // in a function, the gep is likely to be materialized only once. 
+ GlobalVariable *GVar = dyn_cast(GEP->getPointerOperand()); + if (!GVar) + break; + if (HasFreeAddressComp) { + for (User *Usr : GVar->users()) { + Instruction *InstrUsr = dyn_cast(Usr); + if (!InstrUsr || InstrUsr == I) + continue; + if (InstrUsr->getFunction() != I->getFunction()) + continue; + Cost = TargetTransformInfo::TCC_Free; + break; + } + break; + } + break; + } + default: + Cost = TTI.getUserCost(I); + break; + } + + // Accumulate additional cost from constant and global operands. + for (auto OpI = I->op_begin(), OpE = I->op_end(); OpI != OpE; ++OpI) { + Constant *COp = dyn_cast(&*OpI); + if (!COp) + continue; + + // Get the code size cost of any constant int immediates. + if (ConstantInt *CI = dyn_cast(COp)) { + if (isa(I)) + continue; + Cost += TTI.getIntImmCodeSizeCost(I->getOpcode(), OpI->getOperandNo(), + CI->getValue(), CI->getType()); + } + // If we don't have free address computation any use of globals + // can actually result in more cost. + else if (!HasFreeAddressComp) + Cost += getGlobalValueCost(COp); + } + return Cost; + } + + // Helper for getting the cost of a call instruction. + unsigned computeCallCost(Instruction *I, TargetTransformInfo &TTI) { + CallSite CS(I); + Function *F = CS.getCalledFunction(); + + // For intrinsics and non lowered calls, use the default cost. + if (F && F->isDeclaration()) { + if (F->isIntrinsic()) + return TTI.getUserCost(I); + if (!TTI.isLoweredToCall(F)) + return TargetTransformInfo::TCC_Basic; + } + // Otherwise compute the register usage of the call instead of just + // a param count. + const DataLayout &Layout = CS.getParent()->getModule()->getDataLayout(); + unsigned WidestRegister = TTI.getRegisterBitWidth(false); + // Collect the total size of the parameters. + unsigned TotalParamSizeInBits = 0; + for (Value *Op : CS.args()) { + TotalParamSizeInBits += + getTypeSize(Layout, Op->getType(), WidestRegister); + } + // Potential reload for call return. 
+ if (!I->use_empty()) { + Type *RetTy = CS.getType(); + TotalParamSizeInBits += getTypeSize(Layout, RetTy, WidestRegister); + } + // Call instruction. + unsigned CallCost = TargetTransformInfo::TCC_Basic; + // Setting up the function pointer to be called. + if (!F) + CallCost += TargetTransformInfo::TCC_Basic; + // Parameter/output register usage. + CallCost += TotalParamSizeInBits / WidestRegister; + return CallCost; + } + + /// Stores information for parallel instruction in InstrVec. + std::vector InstrInfo; + SpecificBumpPtrAllocator InfoAllocator; + + /// Stores extra candidate data. + std::vector OutlineCandData; + SpecificBumpPtrAllocator DataAlloc; +}; + +/// \brief A specific instance of an outlined candidate. +struct FunctionSplicer { + FunctionSplicer(bool EmitProfileData) + : EmitProfileData(EmitProfileData), NumOutlined(0) { + // Prepare the function attributes that we don't want to inherit from any + // parents. + NonInheritAttrs.addAttribute(Attribute::AlwaysInline); + NonInheritAttrs.addAttribute(Attribute::ArgMemOnly); + NonInheritAttrs.addAttribute(Attribute::InaccessibleMemOnly); + NonInheritAttrs.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); + NonInheritAttrs.addAttribute(Attribute::InlineHint); + NonInheritAttrs.addAttribute(Attribute::Naked); + NonInheritAttrs.addAttribute(Attribute::NoInline); + NonInheritAttrs.addAttribute(Attribute::NoRecurse); + NonInheritAttrs.addAttribute(Attribute::ReturnsTwice); + NonInheritAttrs.addAttribute(Attribute::ReadOnly); + NonInheritAttrs.addAttribute(Attribute::ReadNone); + NonInheritAttrs.addAttribute(Attribute::NoReturn); + NonInheritAttrs.addAttribute(Attribute::WriteOnly); + } + + // Reset the outliner to prepare for a new instance. + void prepareForNewCandidate(const OutlineCandidate &OC, + IROutlinerMapper &OM) { + OutlinedFn = nullptr; + InitialStartIdx = *OC.begin(); + AdditionalCandidateData &Data = OM.getCandidateData(OC); + + // Handle output type. 
+ LLVMContext &Ctx = OM.getInstr(InitialStartIdx)->getContext(); + unsigned NumOutputs = Data.Outputs.count(); + if (NumOutputs == 0) + OutputType = Type::getVoidTy(Ctx); + else if (NumOutputs == 1) { + Instruction *OnlyOut = + OM.getInstr(InitialStartIdx + Data.Outputs.find_first()); + OutputType = OnlyOut->getType(); + } else { + // Collect information about the outputs of this new candidate. + SmallVector OutputTypes; + for (size_t OutputIdx : Data.Outputs) { + Type *OutTy = OM.getInstr(InitialStartIdx + OutputIdx)->getType(); + OutputTypes.push_back(OutTy); + } + + // Build a struct type to hold the outputs. + OutputType = StructType::get(Ctx, OutputTypes); + } + } + + // Outline a new occurrence of an outline chain. + void outlineOccurrence(const OutlineCandidate &OC, unsigned StartIdx, + IROutlinerMapper &OM) { + ++NumOccurrencesOutlined; + Instruction *Tail = OM.getInstr(StartIdx + (OC.Len - 1)); + Function *ParentFn = Tail->getFunction(); + bool InitialOccur = !OutlinedFn; + AdditionalCandidateData &Data = OM.getCandidateData(OC); + + /// Split the outline chain into its own block. + Instruction *Head = OM.getInstr(StartIdx); + BasicBlock *EntryBlock = Head->getParent(); + // Split our chain instance into a separate block for extraction. + /// Split up to the head if we aren't at the front of our block. + if (Head != &EntryBlock->front() || + EntryBlock == &ParentFn->getEntryBlock()) + EntryBlock = EntryBlock->splitBasicBlock(Head->getIterator()); + /// Split after the tail. + BasicBlock *Exit = nullptr; + if (!Tail->isTerminator()) { + auto SentinalIt = Tail->getNextNode()->getIterator(); + Exit = EntryBlock->splitBasicBlock(SentinalIt); + } + + // Create a new block to patch the outlined section. + BasicBlock *OutlineBlock = + BasicBlock::Create(ParentFn->getContext(), "cso.patch", ParentFn); + + // Create parameter vec for the new call. 
+ unsigned NumOutputs = Data.Outputs.count(); + std::vector Args; + Args.reserve(Data.InputSeq.size()); + + // Build inputs/outputs in order. + for (Input &I : Data.InputSeq) + Args.push_back(OM.getInstrOp(StartIdx + I.InstrNo, I.OpNo)); + + // Replace branches to entry block. + EntryBlock->replaceAllUsesWith(OutlineBlock); + + // Get the first valid debug info. + DebugLoc CallLoc; + if (ParentFn->getSubprogram()) + for (auto It = Head->getIterator(), E = EntryBlock->end(); + !CallLoc && It != E; ++It) + CallLoc = It->getDebugLoc(); + + // If this is not the first occurrence we merge metadata and attributes for + // the outlined instructions. + if (!InitialOccur) { + unsigned KnownIDs[] = {LLVMContext::MD_tbaa, + LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, + LLVMContext::MD_range, + LLVMContext::MD_invariant_load, + LLVMContext::MD_nonnull, + LLVMContext::MD_invariant_group, + LLVMContext::MD_align, + LLVMContext::MD_dereferenceable, + LLVMContext::MD_fpmath, + LLVMContext::MD_dereferenceable_or_null}; + + // Merge special state. + for (unsigned InitI = InitialStartIdx, CurI = StartIdx, + InitE = InitialStartIdx + OC.Len; + InitI < InitE; ++InitI, ++CurI) { + Instruction *InitII = OM.getInstr(InitI); + Instruction *CurII = OM.getInstr(CurI); + // Make sure the alignment is valid as we skip it during congruency + // finding. + if (LoadInst *LI = dyn_cast(InitII)) + LI->setAlignment(std::min(LI->getAlignment(), + cast(CurII)->getAlignment())); + else if (StoreInst *SI = dyn_cast(InitII)) + SI->setAlignment(std::min(SI->getAlignment(), + cast(CurII)->getAlignment())); + // Make sure that no tails are propagated properly. + else if (CallInst *CI = dyn_cast(InitII)) { + auto TCK = cast(CurII)->getTailCallKind(); + if (TCK != CallInst::TCK_Tail) + CI->setTailCallKind(TCK); + } + // Be conservative about flags like nsw/nuw. + else + InitII->andIRFlags(OM.getInstr(CurI)); + + // Merge metadata. 
+ DebugLoc DL = InitII->getDebugLoc(); + if (DL && DL != CurII->getDebugLoc()) + InitII->setDebugLoc(DebugLoc()); + combineMetadata(InitII, CurII, KnownIDs, true); + } + + // Merge function attribute data. + AttributeFuncs::mergeAttributesForInlining(*OutlinedFn, *ParentFn); + + // Make sure that all of the parameters are the correct type. + FunctionType *FTy = OutlinedFn->getFunctionType(); + for (unsigned i = 0, e = Args.size(); i < e; ++i) { + Value *&Arg = Args[i]; + Type *ParamTy = FTy->getParamType(i); + if (ParamTy != Arg->getType()) + Arg = CastInst::Create(CastInst::BitCast, Arg, ParamTy, "", + OutlineBlock); + } + + // Otherwise we outline the initial occurrence. + } else + outlineInitialOccurrence(OC, OM, StartIdx, Args, EntryBlock); + + // Create the patchup for the outline section. + Instruction *PatchupI; + if (InvokeInst *II = dyn_cast(Tail)) { + // If we have an invoke then we create a reload block for the values. + InvokeInst *NewII = + InvokeInst::Create(OutlinedFn, II->getNormalDest(), + II->getUnwindDest(), Args, "", OutlineBlock); + NewII->setCallingConv(OutlinedFn->getCallingConv()); + NewII->setDebugLoc(CallLoc); + PatchupI = NewII; + // Otherwise create a call instruction. + } else { + CallInst *CI = CallInst::Create(OutlinedFn, Args, "", OutlineBlock); + CI->setCallingConv(OutlinedFn->getCallingConv()); + CI->setDebugLoc(CallLoc); + PatchupI = CI; + } + + // Replace uses of outputs and create reloads. + if (NumOutputs == 1) { + Instruction *OnlyOut = OM.getInstr(StartIdx + Data.Outputs.find_first()); + OnlyOut->replaceUsesOutsideBlock(PatchupI, EntryBlock); + } else if (NumOutputs != 0) { + unsigned OutputNum = 0; + for (size_t OutputIdx : Data.Outputs) { + Instruction *Out = OM.getInstr(StartIdx + OutputIdx); + Value *Reload = + ExtractValueInst::Create(PatchupI, OutputNum++, "", OutlineBlock); + Out->replaceUsesOutsideBlock(Reload, EntryBlock); + } + } + + // An invoke occurrence already has a valid terminator. 
+ if (!isa(Tail)) + BranchInst::Create(Exit, OutlineBlock); + + // Delete the dead entry block. + if (!InitialOccur) + EntryBlock->eraseFromParent(); + } + + // \brief Finalize this function as an outline instance. + void finalize(const OutlineCandidate &OC, IROutlinerMapper &OM) { + // Check to see if our tail is an invoke instruction. If so then we + // transform it into a call. + Instruction *Tail = OM.getInstr(InitialStartIdx + OC.Len - 1); + if (InvokeInst *II = dyn_cast(Tail)) { + SmallVector IIArgs(II->arg_operands()); + SmallVector Bundles; + II->getOperandBundlesAsDefs(Bundles); + CallInst *CI = + CallInst::Create(II->getCalledValue(), IIArgs, Bundles, "", II); + CI->setAttributes(II->getAttributes()); + CI->setCallingConv(II->getCallingConv()); + CI->setDebugLoc(II->getDebugLoc()); + CI->copyMetadata(*II); + II->replaceAllUsesWith(CI); + II->eraseFromParent(); + } + + // Re-unique the inputs to the function. + uniqueInputs(OM.getCandidateData(OC)); + + // Set the final function name. + OutlinedFn->setName(Twine("cso_") + Twine(NumOutlined++)); + + // Debug. + DEBUG(dbgs() << "** Outlining : " << OutlinedFn->getName() << "\n" + << " Candidate : " << OC.ID << "\n" + << " occurrences : " << OC.size() << "\n" + << " size : " << OC.Len << "\n" + << " benefit : " << OC.Benefit << "\n" + << " benefit per occurrence : " << OC.BenefitPerOccur << "\n"); + } + +private: + // Re-Unique the inputs for the outlined function. + // example: + // void outlinedFn(int, int); + // call 1: outlinedFn(a, a); + // call 2: outlinedFn(b, b); + // We identify that parameters 1 and 2 are the same + // for each callsite, so we condense them. 
+ // Computes argument congruency groups across every call site of the
+ // outlined function: arguments that are always passed the same value at
+ // every call can share one parameter.
+ // NOTE(review): container template arguments are stripped in this patch
+ // text (DenseMap / std::vector below) — presumably
+ // DenseMap<unsigned, unsigned> and std::vector<BitVector>; confirm.
+ void uniqueInputs(AdditionalCandidateData &Data) {
+ FunctionType *CurFnTy = OutlinedFn->getFunctionType();
+ unsigned NumInputs = CurFnTy->getNumParams();
+
+ // Map from argument number to the index of its congruency group.
+ DenseMap ArgNoToCG, RhsArgNoToCG;
+ std::vector ArgCongruencyGroups, RhsArgCGroups;
+
+ // Helper function to collect the information on the use of inputs in the
+ // outlined function. This identifies which arguments are actually congruent
+ // to each other for a specific function call.
+ auto CollectInputInfo = [&](CallSite &CS,
+ DenseMap &ANoToCG,
+ std::vector &ArgCGroups) {
+ ArgCGroups.reserve(NumInputs);
+ for (unsigned i = 0, e = NumInputs; i < e; ++i) {
+ // We already evaluated the equivalencies for this arg.
+ if (ANoToCG.count(i))
+ continue;
+ Value *Op = CS.getArgOperand(i);
+ BitVector CurrentGroup(NumInputs);
+ CurrentGroup.set(i);
+ ANoToCG.try_emplace(i, ArgCGroups.size());
+ // Group every later argument that receives the same value at this
+ // call site with argument i.
+ for (unsigned j = i + 1; j < e; ++j) {
+ if (ANoToCG.count(j))
+ continue;
+ if (CS.getArgOperand(j) == Op) {
+ CurrentGroup.set(j);
+ ANoToCG.try_emplace(j, ArgCGroups.size());
+ }
+ }
+ ArgCGroups.emplace_back(std::move(CurrentGroup));
+ }
+ };
+
+ // Build initial equivalences from the first call.
+ auto UserIt = OutlinedFn->user_begin();
+ CallSite FirstCS(cast(*UserIt));
+ ++UserIt;
+ CollectInputInfo(FirstCS, ArgNoToCG, ArgCongruencyGroups);
+
+ // If we have the same amount of congruency groups as we do arguments,
+ // then they are already unique.
+ if (NumInputs == ArgCongruencyGroups.size())
+ return;
+ // Check every other user to see if the equivalencies hold up.
+ BitVector ResolvedInputs(NumInputs);
+ // BitVector helper to hold non congruent matches between argument groups.
+ BitVector NonCongruentLeft;
+ for (auto UserE = OutlinedFn->user_end(); UserIt != UserE; ++UserIt) {
+ CallSite CS(cast(*UserIt));
+ RhsArgCGroups.clear();
+ RhsArgNoToCG.clear();
+ CollectInputInfo(CS, RhsArgNoToCG, RhsArgCGroups);
+ ResolvedInputs.reset();
+ // Intersect each existing group with this call site's grouping; any
+ // arguments that fall out of the intersection get a fresh group.
+ for (unsigned i = 0, e = NumInputs; i < e; ++i) {
+ if (ResolvedInputs.test(i))
+ continue;
+ BitVector &LhsGroup = ArgCongruencyGroups[ArgNoToCG[i]];
+ BitVector &RhsGroup = RhsArgCGroups[RhsArgNoToCG[i]];
+
+ /// Build non congruent arguments between the groups.
+ NonCongruentLeft = LhsGroup;
+ /// Congruent matches.
+ LhsGroup &= RhsGroup;
+ assert(LhsGroup.count() > 0);
+
+ // Non congruent sets on both sides still act as a congruency group.
+ NonCongruentLeft ^= LhsGroup;
+
+ // Mark arguments as handled.
+ for (unsigned SetBit : LhsGroup.set_bits())
+ ResolvedInputs.set(SetBit);
+ if (NonCongruentLeft.count() == 0)
+ continue;
+
+ // Move the non congruent matches from the left group to a
+ // new congruency group.
+ unsigned NewGroupId = ArgCongruencyGroups.size();
+ ArgCongruencyGroups.emplace_back(std::move(NonCongruentLeft));
+
+ // Move non congruent matches to a new congruency group
+ // and remove them from the top level mapping.
+ for (unsigned SetBit : ArgCongruencyGroups.back().set_bits())
+ ArgNoToCG[SetBit] = NewGroupId;
+ }
+ }
+
+ // No inputs can be condensed.
+ if (NumInputs == ArgCongruencyGroups.size())
+ return;
+
+ // Build new function from the condensed inputs.
+ // Each group is represented by its lowest-numbered argument's type.
+ std::vector NewFnTys;
+ for (auto &It : ArgCongruencyGroups)
+ NewFnTys.push_back(CurFnTy->getParamType(It.find_first()));
+
+ // Create the merged function.
+ FunctionType *NewFnTy =
+ FunctionType::get(CurFnTy->getReturnType(), NewFnTys, false);
+ Function *MergedFn = Function::Create(NewFnTy, OutlinedFn->getLinkage(), "",
+ OutlinedFn->getParent());
+ MergedFn->takeName(OutlinedFn);
+ MergedFn->copyAttributesFrom(OutlinedFn);
+
+ /// Move Fn Body.
+ MergedFn->getBasicBlockList().splice(MergedFn->begin(),
+ OutlinedFn->getBasicBlockList());
+
+ // Remap the arguments.
+ // Each old argument is replaced by the merged parameter of its
+ // congruency group.
+ for (Argument &Arg : OutlinedFn->args()) {
+ Value *Repl = std::next(MergedFn->arg_begin(), ArgNoToCG[Arg.getArgNo()]);
+ Arg.replaceAllUsesWith(Repl);
+ Repl->takeName(&Arg);
+ }
+
+ // Rewrite the calls to this function with calls to the merged function.
+ // Iterate by use and advance before mutation since call sites are erased.
+ std::vector CallArgs;
+ for (auto U = OutlinedFn->use_begin(), E = OutlinedFn->use_end(); U != E;) {
+ Instruction *OldI = cast(U->getUser());
+ CallSite CS(OldI);
+ ++U;
+ CallArgs.clear();
+ /// Map call args by their congruency group.
+ for (auto &It : ArgCongruencyGroups)
+ CallArgs.push_back(CS.getArgOperand(It.find_first()));
+ /// Create the new call.
+ // NOTE(review): the dyn_cast/cast calls here have had their template
+ // arguments stripped by extraction; presumably CallInst / InvokeInst.
+ Instruction *NewI;
+ if (CallInst *CI = dyn_cast(OldI)) {
+ CallInst *NewCall = CallInst::Create(MergedFn, CallArgs, "", CI);
+ NewCall->setCallingConv(CI->getCallingConv());
+ NewCall->setTailCall(CI->isTailCall());
+ NewI = NewCall;
+ } else if (InvokeInst *II = dyn_cast(OldI)) {
+ InvokeInst *NewCall =
+ InvokeInst::Create(MergedFn, II->getNormalDest(),
+ II->getUnwindDest(), CallArgs, "", II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewI = NewCall;
+ } else
+ llvm_unreachable("Invalid callsite for outlined function.");
+ NewI->setDebugLoc(OldI->getDebugLoc());
+ OldI->replaceAllUsesWith(NewI);
+ OldI->eraseFromParent();
+ }
+ OutlinedFn->eraseFromParent();
+ OutlinedFn = MergedFn;
+ }
+
+ // \brief Outline the initial occurrence of this chain.
+ // Creates the new function, splices the entry block into it, and wires
+ // up arguments; later occurrences are replaced with calls elsewhere.
+ void outlineInitialOccurrence(const OutlineCandidate &OC,
+ IROutlinerMapper &OM, size_t StartIdx,
+ ArrayRef Args, BasicBlock *Entry) {
+ Function *ParentFn = Entry->getParent();
+
+ /// Function type for outlined function.
+ SmallVector Tys;
+ Tys.reserve(Args.size());
+ for (Value *Arg : Args)
+ Tys.push_back(Arg->getType());
+ FunctionType *FTy = FunctionType::get(OutputType, Tys, false);
+
+ /// Create function and move outlined block.
+ // Private linkage + fastcc: the function is module-local and callable
+ // only from the rewritten occurrences.
+ OutlinedFn = Function::Create(FTy, GlobalValue::PrivateLinkage, "",
+ Entry->getModule());
+ OutlinedFn->setCallingConv(CallingConv::Fast);
+ OutlinedFn->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ OutlinedFn->setAlignment(ParentFn->getAlignment());
+ OutlinedFn->getBasicBlockList().splice(
+ OutlinedFn->end(), Entry->getParent()->getBasicBlockList(),
+ Entry->getIterator());
+
+ /// Inherit attributes.
+ // Copy function attributes from the parent, minus those that must not
+ // be propagated (NonInheritAttrs, AllocSize).
+ LLVMContext &Ctx = OutlinedFn->getContext();
+ AttributeSet AttrsToAdd =
+ ParentFn->getAttributes().getAttributes(AttributeList::FunctionIndex);
+ AttrsToAdd = AttrsToAdd.removeAttributes(Ctx, NonInheritAttrs);
+ AttrsToAdd = AttrsToAdd.removeAttribute(Ctx, Attribute::AllocSize);
+ OutlinedFn->addAttributes(AttributeList::FunctionIndex, AttrsToAdd);
+
+ // Drop any uses of the outlined instructions as metadata.
+ for (unsigned InitI = StartIdx, InitE = StartIdx + OC.Len; InitI < InitE;
+ ++InitI) {
+ Instruction *InitII = OM.getInstr(InitI);
+ if (InitII->isUsedByMetadata())
+ ValueAsMetadata::handleDeletion(InitII);
+ }
+
+ // FIXME: Ideally we should compute the real count for this function but
+ // for now we just tag it as cold.
+ if (EmitProfileData)
+ OutlinedFn->addFnAttr(Attribute::Cold);
+
+ AdditionalCandidateData &Data = OM.getCandidateData(OC);
+
+ // Erase the current terminator if we don't have an invoke.
+ // NOTE(review): `isa(CurTerm)` has lost its template argument to
+ // extraction — presumably isa<InvokeInst>; confirm against the diff.
+ Instruction *CurTerm = Entry->getTerminator();
+ if (!isa(CurTerm))
+ CurTerm->eraseFromParent();
+
+ /// Create stores for any output variables.
+ // One output is returned directly; multiple outputs are packed into an
+ // aggregate OutputType via insertvalue and returned as one value.
+ Value *RetVal = nullptr;
+ unsigned NumOutputs = Data.Outputs.count();
+ if (NumOutputs == 1)
+ RetVal = OM.getInstr(StartIdx + Data.Outputs.find_first());
+ else if (NumOutputs != 0) {
+ RetVal = UndefValue::get(OutputType);
+ unsigned OutputNum = 0;
+ for (size_t OutputIdx : Data.Outputs) {
+ Instruction *Out = OM.getInstr(StartIdx + OutputIdx);
+ InsertValueInst *Insert =
+ InsertValueInst::Create(RetVal, Out, OutputNum++);
+ Insert->insertAfter(Out);
+ RetVal = Insert;
+ }
+ }
+ ReturnInst::Create(Ctx, RetVal, Entry);
+
+ /// Replace input operands with function arguments.
+ auto ArgI = OutlinedFn->arg_begin();
+ for (Input &I : Data.InputSeq) {
+ Instruction *InputInst = OM.getInstr(StartIdx + I.InstrNo);
+ InputInst->setOperand(I.OpNo, &*ArgI++);
+ }
+
+ /// Insert the constant param fixups.
+ // Condensed constant inputs are rebuilt as base-arg + constant-diff
+ // adds; a zero diff reuses the base argument directly.
+ for (ConstantCondenseInstance &CInst : Data.ConstInsts) {
+ Input &BaseInput = CInst.BaseInput;
+ Value *CArg =
+ OM.getInstrOp(InitialStartIdx + BaseInput.InstrNo, BaseInput.OpNo);
+ Type *ConstantTy = CArg->getType();
+ for (unsigned i = 0, e = CInst.Diffs.size(); i < e; ++i) {
+ Input &I = CInst.InputSeqs[i];
+ Instruction *ReplI = OM.getInstr(InitialStartIdx + I.InstrNo);
+ APInt &Diff = CInst.Diffs[i];
+ if (Diff.isNullValue()) {
+ ReplI->setOperand(I.OpNo, CArg);
+ continue;
+ }
+ Value *DiffVal = ConstantInt::get(ConstantTy, CInst.Diffs[i]);
+ Value *Repl = BinaryOperator::CreateAdd(CArg, DiffVal, "", ReplI);
+ ReplI->setOperand(I.OpNo, Repl);
+ }
+ }
+
+ /// Cleanup for ignored instructions.
+ // Strip lifetime markers and debug intrinsics from the outlined block;
+ // iterator is advanced before erase to stay valid.
+ for (auto It = Entry->begin(), E = Entry->end(); It != E;) {
+ Instruction &I = *It;
+ ++It;
+ if (IntrinsicInst *II = dyn_cast(&I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // FIXME: Fix this if we can have merged debug info.
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_value:
+ break;
+ }
+ Entry->getInstList().erase(I.getIterator());
+ }
+ }
+ }
+
+ /// The function created after outlining.
+ Function *OutlinedFn = nullptr;
+ /// Function output type.
+ Type *OutputType = nullptr;
+ /// If we are emitting profile data during outlining.
+ bool EmitProfileData;
+ /// The index of the initial occurrence for the current splice.
+ unsigned InitialStartIdx;
+ /// Set of attributes that are not to be inherited from parent functions.
+ AttrBuilder NonInheritAttrs;
+ /// Number of functions that have been outlined.
+ unsigned NumOutlined;
+};
+
+/// \brief Perform analysis and verification for the found outline candidates.
+struct OutlinerAnalysis {
+ // NOTE(review): template arguments of std::vector and function_ref are
+ // stripped in this patch text — presumably std::vector<OutlineCandidate>
+ // and function_ref<TargetTransformInfo &(Function &)>; confirm.
+ OutlinerAnalysis(IROutlinerMapper &OM,
+ std::vector &CandidateList,
+ function_ref GetTTI,
+ const DataLayout *Layout)
+ : OM(OM), CandidateList(CandidateList), GetTTI(GetTTI), Layout(Layout) {}
+
+ // Analyze our found candidates for benefit and correctness.
+ // Pipeline: build per-instruction info, verify occurrence compatibility,
+ // compute function types, partition on inputs, estimate benefit, and
+ // finally compute call register costs for the survivors.
+ void analyzeCandidateList() {
+ // Generate instruction info for each instruction that is part
+ // of an occurrence.
+ BitVector OccurInstrInfo(OM.getNumMappedInstructions());
+ for (OutlineCandidate &OC : CandidateList)
+ for (unsigned Occur : OC)
+ OccurInstrInfo.set(Occur, Occur + OC.Len);
+ for (unsigned Idx : OccurInstrInfo.set_bits())
+ OM.createInstrInfo(Idx);
+
+ // We verify the occurrences of each candidate are compatible.
+ verifyOccurrenceCompatability();
+
+ // Estimate the function type and benefit of each chain.
+ computeFunctionType(0, CandidateList.size());
+
+ // Analyze the currently profitable candidates for partitioning based
+ // upon equivalent inputs.
+ analyzeInputsForPartitioning();
+
+ // Estimate the benefit for each candidate.
+ computeOutlineBenefit(0, CandidateList.size());
+
+ // The last analysis is the register cost per occurrence.
+ // This is very costly so we do this after selecting the profitable
+ // candidates.
+ computeOutlineCallRegisterCost();
+ }
+
+private:
+ // Helper cache for mapping which functions are compatible with each other.
+ // Functions whose attributes/TTI are inline-compatible share one index,
+ // so occurrences from either function may be outlined together.
+ struct CompatibleAttrCache {
+ unsigned getFnIdx(Function *F,
+ function_ref GetTTI) {
+ // Find an existing mapping of our function.
+ auto It = FnToIdx.find(F);
+ if (It != FnToIdx.end())
+ return It->second;
+
+ // Find a compatible function in the cache.
+ for (auto &MapIt : FnToIdx) {
+ if (!AttributeFuncs::areInlineCompatible(*F, *MapIt.first))
+ continue;
+ TargetTransformInfo &TTI = GetTTI(*MapIt.first);
+ if (TTI.areInlineCompatible(F, MapIt.first)) {
+ FnToIdx[F] = MapIt.second;
+ return MapIt.second;
+ }
+ }
+
+ // Insert our function.
+ FnToIdx[F] = CacheIdx;
+ return CacheIdx++;
+ }
+
+ private:
+ DenseMap FnToIdx;
+ unsigned CacheIdx = 0;
+ };
+ // Helper struct for verifying the compatibility of occurrences within a
+ // candidate. Two occurrences are compatible iff their functions are
+ // compatible and their internal-input index sequences are identical.
+ struct VerifyInst {
+ VerifyInst(CompatibleAttrCache *Cache) : Cache(Cache) {}
+ VerifyInst() = default;
+ void reset(Function *F) {
+ Fn = F;
+ Indices.clear();
+ }
+ bool compare(VerifyInst &R,
+ function_ref GetTTI) {
+ Function *LF = Fn, *RF = R.Fn;
+ if (LF != RF)
+ if (Cache->getFnIdx(LF, GetTTI) != Cache->getFnIdx(RF, GetTTI))
+ return false;
+ return Indices == R.Indices;
+ }
+
+ Function *Fn;
+ std::vector Indices;
+ CompatibleAttrCache *Cache;
+ };
+
+ // Verify the compatibilities of the occurrences in each candidate.
+ // Occurrences that disagree are split off into separate candidates.
+ void verifyOccurrenceCompatability() {
+ // Vector of different verification leaders.
+ std::vector VerificationCands;
+ // Count of occurrences to an verification instance.
+ std::vector Counts;
+ // Maps occurrences to the input index they belong to.
+ std::vector OccurrenceInputIdx;
+ // Cache of attribute compatibility between functions.
+ CompatibleAttrCache Cache;
+ // Verification instance of the current occurrence being considered.
+ VerifyInst CurrentOccurVerifyInst(&Cache);
+
+ size_t CurrentCandidateListSize = CandidateList.size();
+ for (size_t i = 0, e = CurrentCandidateListSize; i < e; ++i) {
+ OutlineCandidate &OC = CandidateList[i];
+
+ // Compute the input sequence for this occurrence.
+ for (size_t Oi = OccurrenceInputIdx.size(), Oe = OC.size(); Oi < Oe;
+ ++Oi) {
+ unsigned Occur = OC.getOccurrence(Oi);
+ CurrentOccurVerifyInst.reset(OM.getInstrFunction(Occur));
+ for (size_t InstrIdx = Occur, InstrE = Occur + OC.Len;
+ InstrIdx < InstrE; ++InstrIdx) {
+ InstructionInfo &II = OM.getInstrInfo(InstrIdx);
+ for (unsigned InIdx : II.InputIndexes) {
+ // We only really need to verify inputs coming from within the
+ // sequence. Other inputs simply help to verify the ordering.
+ // External inputs are normalized to -1 (unsigned wrap).
+ unsigned InputIdx = InIdx < Occur ? -1 : InIdx - Occur;
+ CurrentOccurVerifyInst.Indices.push_back(InputIdx);
+ }
+ }
+
+ // Check for existing mapping of this instance.
+ auto It =
+ std::find_if(VerificationCands.begin(), VerificationCands.end(),
+ [&](VerifyInst &L) {
+ return L.compare(CurrentOccurVerifyInst, GetTTI);
+ });
+ if (It != VerificationCands.end()) {
+ size_t InternalInputIt = It - VerificationCands.begin();
+ ++Counts[InternalInputIt];
+ OccurrenceInputIdx.push_back(InternalInputIt);
+ continue;
+ }
+ // New verification leader; preserve the vector's capacity across the
+ // move so the next iteration does not reallocate.
+ unsigned OrigCapacity = CurrentOccurVerifyInst.Indices.capacity();
+ OccurrenceInputIdx.push_back(VerificationCands.size());
+ VerificationCands.emplace_back(std::move(CurrentOccurVerifyInst));
+ CurrentOccurVerifyInst.Indices.reserve(OrigCapacity);
+ Counts.push_back(1);
+ }
+
+ OutlineCandidate &Cur = CandidateList[i];
+ size_t SharedSizeWithNext = Cur.SharedSizeWithNext;
+ size_t IdxOfNext = i + 1;
+ while (SharedSizeWithNext > 0 && !CandidateList[IdxOfNext].isValid())
+ SharedSizeWithNext = CandidateList[IdxOfNext++].SharedSizeWithNext;
+ size_t FirstOccur = Cur.getOccurrence(0);
+
+ // Only split if needed.
+ if (Counts.size() > 1)
+ splitOutlineChain(i, Counts, OccurrenceInputIdx);
+
+ // If we share a size with the next chain then we do cleanup and set up to
+ // reduce the amount of work we need to do during the next iteration.
+ if (SharedSizeWithNext > 0 && CandidateList[IdxOfNext].isValid()) {
+ // Get the cut off point for moving to the next candidate.
+ size_t SharedCutOffPoint = 0;
+ for (size_t InstrIdx = FirstOccur,
+ InstrE = FirstOccur + SharedSizeWithNext;
+ InstrIdx < InstrE; ++InstrIdx) {
+ InstructionInfo &II = OM.getInstrInfo(InstrIdx);
+ SharedCutOffPoint += II.InputIndexes.size();
+ }
+
+ // Update the size of the internal inputs vectors.
+ // Truncate each leader's index vector to the shared prefix length.
+ for (size_t InIt = 0, InE = VerificationCands.size(); InIt < InE;
+ ++InIt)
+ VerificationCands[InIt].Indices.resize(SharedCutOffPoint);
+
+ // Don't bother merging if there is only one instance.
+ if (Counts.size() == 1)
+ continue;
+
+ // Set resolved occurrences that point to the first vector.
+ BitVector UnResolved(OccurrenceInputIdx.size(), true);
+ for (size_t i = 0, e = OccurrenceInputIdx.size(); i < e; ++i)
+ if (OccurrenceInputIdx[i] == 0)
+ UnResolved.reset(i);
+ // Condense the internal inputs vector in the case where two vectors are
+ // now equivalent given the smaller size.
+ size_t InsertIdx = 1;
+ for (size_t i = 1, e = VerificationCands.size(); i < e; ++i) {
+ VerifyInst &VI = VerificationCands[i];
+ auto RemapOccurIdxs = [&](size_t Old, size_t New) {
+ for (size_t OccurIdx : UnResolved.set_bits()) {
+ if (OccurrenceInputIdx[OccurIdx] == Old) {
+ OccurrenceInputIdx[OccurIdx] = New;
+ UnResolved.reset(OccurIdx);
+ }
+ }
+ };
+ // Try to remap to an existing instance.
+ auto Remap = [&]() -> bool {
+ for (size_t j = 0; j < InsertIdx; ++j) {
+ if (VerificationCands[j].compare(VI, GetTTI)) {
+ Counts[j] += Counts[i];
+ RemapOccurIdxs(i, j);
+ return true;
+ }
+ }
+ return false;
+ };
+
+ // Update mapping with new instance.
+ if (!Remap()) {
+ // Keep this leader: compact it into the next free slot.
+ if (i != InsertIdx) {
+ Counts[InsertIdx] = Counts[i];
+ RemapOccurIdxs(i, InsertIdx);
+ VerificationCands[InsertIdx++] = std::move(VerificationCands[i]);
+ } else
+ ++InsertIdx;
+ }
+ }
+ VerificationCands.resize(InsertIdx);
+ Counts.resize(InsertIdx);
+ continue;
+ }
+ // Otherwise we just cleanup what we used.
+ VerificationCands.clear();
+ Counts.clear();
+ OccurrenceInputIdx.clear();
+ }
+ }
+
+ // Split the OutlineChain at index /p CurrentChainIndex into N different
+ // candidates with the provided memberships.
+ // Membership groups below MinOccurrences are dropped; the first group
+ // that survives keeps the original chain index.
+ void splitOutlineChain(size_t CurrentChainIndex,
+ const std::vector &MembershipCounts,
+ const std::vector &OccurrenceInputIdx) {
+ OutlineCandidate *OrigChain = &CandidateList[CurrentChainIndex];
+ SmallVector SplitChains(MembershipCounts.size(), -1);
+ size_t FirstValid = 0;
+ while (FirstValid < MembershipCounts.size() &&
+ MembershipCounts[FirstValid] < MinOccurrences)
+ ++FirstValid;
+ // No group is large enough to outline: invalidate the whole chain.
+ if (FirstValid == MembershipCounts.size()) {
+ OrigChain->invalidate();
+ OrigChain->Occurrences.clear();
+ OrigChain->SharedSizeWithNext = 0;
+ return;
+ }
+
+ SplitChains[FirstValid] = CurrentChainIndex;
+
+ // Add new chains for each valid count after the first.
+ unsigned OrigLen = OrigChain->Len;
+ for (size_t i = FirstValid + 1, e = MembershipCounts.size(); i < e; ++i) {
+ size_t Count = MembershipCounts[i];
+ if (Count < MinOccurrences)
+ continue;
+ SplitChains[i] = CandidateList.size();
+ CandidateList.emplace_back(SplitChains[i], OrigLen);
+ OutlineCandidate &NewChain = CandidateList.back();
+ NewChain.Occurrences.reserve(Count);
+ }
+
+ // Move occurrences to their new parents.
+ // Re-fetch: the emplace_back calls above may have reallocated the list.
+ OrigChain = &CandidateList[CurrentChainIndex];
+ for (size_t i = 0, e = OrigChain->size(); i < e; ++i) {
+ size_t NewParentIdx = SplitChains[OccurrenceInputIdx[i]];
+ if (NewParentIdx != CurrentChainIndex && NewParentIdx != size_t(-1)) {
+ OutlineCandidate &NewParent = CandidateList[NewParentIdx];
+ NewParent.Occurrences.push_back(OrigChain->Occurrences[i]);
+ }
+ }
+ // Remove in reverse so earlier indices stay stable.
+ for (ssize_t i = OrigChain->size() - 1; i >= 0; --i)
+ if (OccurrenceInputIdx[i] != FirstValid)
+ OrigChain->removeOccurrence(i);
+
+ // Update shared size information.
+ if (CurrentChainIndex != 0) {
+ OutlineCandidate &Prev = CandidateList[CurrentChainIndex - 1];
+ if (Prev.SharedSizeWithNext > 0 &&
+ Prev.getOccurrence(0) != OrigChain->getOccurrence(0))
+ Prev.SharedSizeWithNext = 0;
+ }
+ }
+
+ // Estimate the function type of each outline chain in the given range.
+ // Computes input sequences, outputs, and applies constant folding and
+ // constant condensing to the inputs.
+ void computeFunctionType(size_t StartIdx, size_t EndIdx) {
+ std::vector InputOperands;
+ BitVector FoldableInputs;
+
+ // Create candidate data for each of the candidates.
+ OM.createCandidateData(EndIdx - StartIdx);
+ for (size_t i = StartIdx; i < EndIdx; ++i) {
+ OutlineCandidate &OC = CandidateList[i];
+ if (OC.size() == 0)
+ continue;
+ AdditionalCandidateData &OCData = OM.getCandidateData(OC);
+ unsigned FirstOccur = *OC.begin();
+
+ // Compute the input sequence if needed.
+ if (OCData.InputSeq.empty()) {
+ // Inputs are operands that come from outside of the chain range.
+ OCData.InputSeq.reserve(OC.Len);
+ for (unsigned InstrIdx = FirstOccur, InstrE = FirstOccur + OC.Len;
+ InstrIdx < InstrE; ++InstrIdx) {
+ InstructionInfo &II = OM.getInstrInfo(InstrIdx);
+ for (unsigned i = 0, e = II.InputIndexes.size(); i < e; ++i)
+ if (II.InputIndexes[i] < FirstOccur)
+ OCData.InputSeq.emplace_back(InstrIdx - FirstOccur, i);
+ }
+ computeCandidateOutputs(OC, OCData);
+
+ // After computing we also compute the input sequences of chains
+ // that we share size with. We have already calculated the input
+ // sequence for this chain as it is a subset of our current
+ // sequence.
+ OutlineCandidate *Cur = &OC;
+ while (Cur->SharedSizeWithNext > 0) {
+ OutlineCandidate *Next = std::next(Cur);
+ AdditionalCandidateData &CurData = OM.getCandidateData(*Cur);
+ AdditionalCandidateData &NextData = OM.getCandidateData(*Next);
+ computeCandidateOutputs(*Next, NextData);
+ // The shared prefix of the input sequence carries over directly.
+ for (unsigned i = 0, e = CurData.InputSeq.size(); i < e; ++i) {
+ if (CurData.InputSeq[i].InstrNo >= Cur->SharedSizeWithNext) {
+ NextData.InputSeq.assign(CurData.InputSeq.begin(),
+ CurData.InputSeq.begin() + i);
+ break;
+ }
+ }
+ Cur = Next;
+ }
+ }
+
+ // Verify the outputs.
+ auto HasInvalidOutput = [&]() -> bool {
+ // FIXME: Non invoke outputs may need special handling for dominance.
+ // An invoke tail may only produce its own result as an output.
+ unsigned NumOutputs = OCData.Outputs.count();
+ if (isa(OM.getInstr(FirstOccur + OC.Len - 1))) {
+ if (NumOutputs > 1)
+ return true;
+ if (NumOutputs == 1 && !OCData.Outputs.test(OC.Len - 1))
+ return true;
+ }
+ return false;
+ };
+ if (HasInvalidOutput()) {
+ OC.Occurrences.clear();
+ OC.SharedSizeWithNext = 0;
+ continue;
+ }
+
+ // Check to see if we share candidates with our predecessor. If we
+ // do then we can avoid rechecking candidates that we already have
+ // information for.
+ size_t SharedOccurrencesWithPrev = 1;
+ if (i > StartIdx) {
+ OutlineCandidate &Prev = CandidateList[i - 1];
+ if (Prev.SharedSizeWithNext > 0)
+ SharedOccurrencesWithPrev = Prev.size();
+ }
+
+ // Try to constant fold inputs into the candidate.
+ constantFoldInputs(FoldableInputs, OC, SharedOccurrencesWithPrev,
+ InputOperands);
+
+ // Condense any remaining constant inputs if it is profitable.
+ condenseConstantIntInputs(OC);
+ }
+ }
+
+ // Remove any inputs into the candidate that will be constant folded.
+ // An input is foldable when every occurrence passes the identical
+ // Constant/InlineAsm operand; such inputs become literals in the body.
+ void constantFoldInputs(BitVector &FoldableInputs, OutlineCandidate &OC,
+ size_t SharedOccurrencesWithPrev,
+ std::vector &InputOperands) {
+ AdditionalCandidateData &OCData = OM.getCandidateData(OC);
+ unsigned FirstOccur = *OC.begin();
+
+ // Only recompute the input operands if we didn't share a size with the
+ // previous chain.
+ if (SharedOccurrencesWithPrev == 1) {
+ // Get the operands for the first candidate.
+ InputOperands.clear();
+ FoldableInputs.set();
+ FoldableInputs.resize(OCData.InputSeq.size(), true);
+ for (size_t i = 0, e = OCData.InputSeq.size(); i < e; ++i) {
+ Input &I = OCData.InputSeq[i];
+ Value *Op = OM.getInstrOp(FirstOccur + I.InstrNo, I.OpNo);
+ InputOperands.push_back(Op);
+ // NOTE(review): both isa<> template arguments here were stripped by
+ // extraction — presumably Constant and InlineAsm; confirm.
+ if (!isa(Op) && !isa(Op))
+ FoldableInputs.reset(i);
+ }
+ } else {
+ InputOperands.resize(OCData.InputSeq.size(), nullptr);
+ FoldableInputs.resize(InputOperands.size());
+ }
+
+ // Check to see which inputs will be folded.
+ // Any occurrence disagreeing with the first occurrence's operand
+ // disqualifies the input from folding.
+ auto OccurRange =
+ make_range(OC.begin() + SharedOccurrencesWithPrev, OC.end());
+ for (unsigned InputNo : FoldableInputs.set_bits()) {
+ Input &I = OCData.InputSeq[InputNo];
+ Value *IOp = InputOperands[InputNo];
+ if (any_of(OccurRange, [&](unsigned Occur) {
+ return OM.getInstrOp(Occur + I.InstrNo, I.OpNo) != IOp;
+ }))
+ FoldableInputs.reset(InputNo);
+ }
+
+ // Remove all of the inputs that will be folded.
+ erase_if(OCData.InputSeq, [&](Input &I) -> bool {
+ return FoldableInputs.test(&I - OCData.InputSeq.data());
+ });
+ }
+
+ // Condense constant inputs that share a common operational difference
+ // between occurrences.
+ // Example: if input B is always input A + 4 at every occurrence, B can be
+ // rebuilt inside the outlined function as A + 4 instead of being passed.
+ void condenseConstantIntInputs(OutlineCandidate &OC) {
+ AdditionalCandidateData &OCData = OM.getCandidateData(OC);
+ if (!OCData.ConstInsts.empty())
+ return;
+
+ // Compute the set of inputs that are always constant.
+ unsigned NumConstantIntInputs = OCData.InputSeq.size();
+ if (NumConstantIntInputs < 2)
+ return;
+ BitVector ConstantInputs(NumConstantIntInputs, true);
+ for (unsigned InputNo = 0, InputE = ConstantInputs.size(); InputNo < InputE;
+ ++InputNo) {
+ Input &I = OCData.InputSeq[InputNo];
+ for (unsigned Occur : OC) {
+ Value *InputOp = OM.getInstrOp(Occur + I.InstrNo, I.OpNo);
+ if (!isa(InputOp)) {
+ ConstantInputs.reset(InputNo);
+ // Condensing needs at least two constant inputs.
+ if (--NumConstantIntInputs == 1)
+ return;
+ break;
+ }
+ }
+ }
+
+ // Mapping between constant input number to function input number.
+ std::vector ConstNumToFnInputMap;
+ for (unsigned CInput : ConstantInputs.set_bits())
+ ConstNumToFnInputMap.push_back(CInput);
+
+ // Current instance variables.
+ SmallVector InstDiff(ConstantInputs.count());
+ BitVector InstMemberships(InstDiff.size());
+
+ // The constant inputs that are condensed and need to be removed.
+ BitVector RemovedInputs(ConstantInputs.size());
+
+ // Iterate over the active constant parameters.
+ auto InputI = ConstantInputs.set_bits_begin(),
+ InputE = ConstantInputs.set_bits_end();
+ for (unsigned ConstIntNum = 0; InputI != InputE; ++InputI, ++ConstIntNum) {
+ // Don't recheck already handled inputs.
+ if (RemovedInputs.test(*InputI))
+ continue;
+
+ InstMemberships.reset();
+ InstDiff.clear();
+
+ // Get the input sequence attached this param.
+ Input &I = OCData.InputSeq[*InputI];
+
+ // Run over each occurrence to compute the difference between the
+ // operands. An input remains a member only if its difference from the
+ // base input is identical at every occurrence.
+ bool Success = true;
+ for (unsigned i = 0, e = OC.size(); i < e; ++i) {
+ bool InitialCollect = i == 0;
+ unsigned Occur = OC.getOccurrence(i);
+
+ // Get the base parameter we are working off of.
+ Value *InputOp = OM.getInstrOp(Occur + I.InstrNo, I.OpNo);
+ Type *OpTy = InputOp->getType();
+ const APInt &FirstConstVal = cast(InputOp)->getValue();
+
+ // Compute the difference for each input.
+ auto NextInputI = InputI;
+ ++NextInputI;
+ for (unsigned NextCIntNum = 0; NextInputI != InputE;
+ ++NextInputI, ++NextCIntNum) {
+
+ // Already failed member.
+ if (!InitialCollect &&
+ !InstMemberships.test(NextCIntNum + ConstIntNum + 1))
+ continue;
+
+ // Already set to be removed.
+ if (InitialCollect && RemovedInputs.test(*NextInputI)) {
+ InstDiff.emplace_back(APInt());
+ continue;
+ }
+
+ // Get the next constant operand.
+ Input &NextI = OCData.InputSeq[*NextInputI];
+ Value *NextOp = OM.getInstrOp(Occur + NextI.InstrNo, NextI.OpNo);
+ ConstantInt *NextConstI = cast(NextOp);
+
+ // Currently only consider constants of the same type.
+ if (NextConstI->getType() != OpTy) {
+ InstDiff.emplace_back(APInt());
+ continue;
+ }
+
+ // Compute the difference.
+ const APInt &NextConstVal = NextConstI->getValue();
+ APInt Diff = NextConstVal - FirstConstVal;
+
+ // Initial collection adds all diffs.
+ if (InitialCollect) {
+ InstDiff.emplace_back(Diff);
+ InstMemberships.set(NextCIntNum + ConstIntNum + 1);
+ } else if (InstDiff[NextCIntNum] != Diff)
+ InstMemberships.reset(NextCIntNum + ConstIntNum + 1);
+ }
+ // No condensable constants.
+ if (InstMemberships.none()) {
+ Success = false;
+ break;
+ }
+ }
+ // No condensable constants.
+ if (!Success)
+ continue;
+
+ // Create the new constant inst.
+ Input &BaseInputSeq = OCData.InputSeq[ConstNumToFnInputMap[ConstIntNum]];
+ OCData.ConstInsts.emplace_back(BaseInputSeq, InstMemberships.count());
+ ConstantCondenseInstance &CInst = OCData.ConstInsts.back();
+
+ // Add the input locations.
+ SmallSet, 8> UniqueDiffs;
+ unsigned ConstInputStart = ConstIntNum + 1;
+ for (unsigned ConstInput : InstMemberships.set_bits()) {
+ unsigned InputNo = ConstNumToFnInputMap[ConstInput];
+ CInst.InputSeqs.push_back(OCData.InputSeq[InputNo]);
+ APInt &Diff = InstDiff[ConstInput - ConstInputStart];
+ CInst.Diffs.emplace_back(Diff);
+ RemovedInputs.set(InputNo);
+
+ // Add this diff.
+ // Track distinct non-zero diffs; each unique diff costs one add
+ // instruction in the outlined body.
+ if (!Diff.isNullValue()) {
+ SmallString<64> S;
+ Diff.toStringSigned(S);
+ UniqueDiffs.insert(S);
+ }
+ }
+
+ // Add the amount of add instructions needed for this instance.
+ CInst.Cost = UniqueDiffs.size();
+ }
+
+ // No condensable inputs.
+ if (OCData.ConstInsts.empty())
+ return;
+
+ // Remove the dead inputs.
+ for (unsigned i = 0, e = ConstantInputs.size(), InputNo = 0; i < e; ++i) {
+ if (RemovedInputs.test(i))
+ OCData.InputSeq.erase(OCData.InputSeq.begin() + InputNo);
+ else
+ ++InputNo;
+ }
+ }
+
+ // Analyzes the input parameters to each candidate to see if we can
+ // get a more profitable chain by only considering some candidates.
+ // Occurrences sharing the same constant input are grouped into new
+ // sub-candidates where that input can be folded away.
+ void analyzeInputsForPartitioning() {
+ size_t CurrentListSize = CandidateList.size();
+ SmallVector NewCandidates;
+ SmallDenseMap MembershipVals;
+ std::vector Memberships;
+ unsigned CurMembershipNum = 0;
+ for (size_t i = 0; i < CurrentListSize; ++i) {
+ OutlineCandidate &OC = CandidateList[i];
+ AdditionalCandidateData &Data = OM.getCandidateData(OC);
+
+ /// No inputs.
+ if (OC.size() == 0 || Data.InputSeq.empty())
+ continue;
+
+ // Lambda helper for adding a member.
+ // Membership BitVectors are pooled and reused across inputs to avoid
+ // reallocation.
+ auto AddMember = [&](Value *C, unsigned Occur) {
+ auto It = MembershipVals.find(C);
+ if (It == MembershipVals.end()) {
+ if (CurMembershipNum == Memberships.size()) {
+ BitVector New(OC.size());
+ New.set(Occur);
+ Memberships.emplace_back(std::move(New));
+ MembershipVals.try_emplace(C, CurMembershipNum++);
+ return;
+ }
+ MembershipVals.try_emplace(C, CurMembershipNum);
+ BitVector &Cur = Memberships[CurMembershipNum++];
+ Cur.reset();
+ Cur.resize(OC.size());
+ Cur.set(Occur);
+ return;
+ }
+ BitVector &Cur = Memberships[It->second];
+ Cur.set(Occur);
+ };
+
+ // Find the subsets of candidates that have the same constant input.
+ NewCandidates.clear();
+ for (size_t InputI = 0, InputE = Data.InputSeq.size(); InputI < InputE;
+ ++InputI) {
+ Input &In = Data.InputSeq[InputI];
+
+ // Reset the memberships.
+ CurMembershipNum = 0;
+ MembershipVals.clear();
+
+ // Add each constant input to a membership set.
+ for (unsigned OccurI = 0, OccurE = OC.size(); OccurI < OccurE;
+ ++OccurI) {
+ unsigned Occur = OC.getOccurrence(OccurI);
+ Value *IOp = OM.getInstrOp(Occur + In.InstrNo, In.OpNo);
+ if (isa(IOp))
+ AddMember(IOp, OccurI);
+ }
+
+ // Add the memberships that will have a folded input.
+ for (unsigned i = 0; i < CurMembershipNum; ++i) {
+ BitVector &Members = Memberships[i];
+ // Only one occurrence.
+ if (Members.count() < 2)
+ continue;
+ // Already seen.
+ auto It =
+ std::find(NewCandidates.begin(), NewCandidates.end(), Members);
+ if (It == NewCandidates.end())
+ NewCandidates.emplace_back(Members);
+ }
+ }
+
+ // Materialize each distinct grouping as a new candidate sharing the
+ // original chain's length.
+ unsigned OrigLen = OC.Len;
+ for (BitVector &NewCand : NewCandidates) {
+ size_t NewChainIdx = CandidateList.size();
+ CandidateList.emplace_back(NewChainIdx, OrigLen);
+ OutlineCandidate &OGChain = CandidateList[i];
+ OutlineCandidate &NewChain = CandidateList.back();
+ NewChain.Occurrences.reserve(NewCand.count());
+ for (unsigned OccurIdx : NewCand.set_bits())
+ NewChain.Occurrences.push_back(OGChain.getOccurrence(OccurIdx));
+ }
+ }
+ computeFunctionType(CurrentListSize, CandidateList.size());
+ }
+
+ // Compute the external outputs of a given candidate.
+ void computeCandidateOutputs(OutlineCandidate &OC,
+ AdditionalCandidateData &Data) {
+ // Outputs are internal instructions that have uses outside of the chain
+ // range.
+ Data.Outputs.clear();
+ for (unsigned i = 0; i < OC.Len; ++i) {
+ if (any_of(OC, [&](unsigned Occur) {
+ InstructionInfo &II = OM.getInstrInfo(Occur + i);
+ return II.FarthestInSameBlockOutput >= Occur + OC.Len;
+ }))
+ Data.Outputs.set(i);
+ }
+ }
+
+ // Computes the estimated benefit of a set of potential functions.
+ void computeOutlineBenefit(size_t StartIdx, size_t EndIdx) { + SmallDenseSet UniqueInputOperands; + SmallPtrSet MaterializedValues; + for (size_t i = StartIdx; i < EndIdx; ++i) { + OutlineCandidate &OC = CandidateList[i]; + AdditionalCandidateData &Data = OM.getCandidateData(OC); + + /// Reset benefit metrics. + OC.invalidate(); + + // Sanity check. + unsigned NumOccurences = OC.size(); + if (NumOccurences < MinOccurrences) + continue; + DEBUG(dbgs() << "\nCandidate : " << OC.ID << "\n"); + DEBUG(dbgs() << "Num : " << NumOccurences << "; Len : " << OC.Len + << "\n"); + + /// Use the first occurrence as an example for cost analysis. + unsigned FirstOccur = *OC.begin(); + /// Cost for each occurrence of the candidate. + unsigned CostPerOccurence = 0; + /// Cost for the outlined function. + unsigned NewFunctionCost = 0; + /// If this outline sequence contains a call. + bool ContainsCall = false; + + /// New function contains a return. + NewFunctionCost += 1; + MaterializedValues.clear(); + + // Estimate the cost of this chain of instructions. + Function *FirstOccurFn = OM.getInstrFunction(FirstOccur); + TargetTransformInfo &TTI = GetTTI(*FirstOccurFn); + unsigned WidestRegister = TTI.getRegisterBitWidth(false); + unsigned ChainCost = 0; + for (size_t i = 0, e = OC.Len, InstIdx = FirstOccur; i < e; + ++i, ++InstIdx) { + ChainCost += OM.getInstrCost(TTI, InstIdx); + Instruction *I = OM.getInstr(InstIdx); + + // Add in cost for constant materialization. 
+ for (Value *Op : I->operands()) { + if (ConstantFP *CF = dyn_cast(Op)) { + if (!MaterializedValues.insert(CF).second) + continue; + if (CF->isZero()) + continue; + ++NewFunctionCost; + if (CF->isNegative()) + ++NewFunctionCost; + } else if (ConstantDataVector *CV = + dyn_cast(Op)) { + if (CV->isZeroValue()) + continue; + if (!CV->getType()->getScalarType()->isFloatingPointTy()) + continue; + if (MaterializedValues.insert(CV).second) + ++NewFunctionCost; + } + } + + switch (I->getOpcode()) { + default: + break; + case Instruction::Invoke: + case Instruction::Call: { + ContainsCall = true; + + // If we tail then we don't need a return. + if (i == e - 1) + --NewFunctionCost; + break; + } + case Instruction::Xor: + case Instruction::Or: { + auto &Info = OM.getInstrInfo(InstIdx); + Instruction *L = dyn_cast(I->getOperand(0)); + Instruction *R = dyn_cast(I->getOperand(1)); + if (!L || !R) + break; + size_t TotalVal = 0; + if (L->getOpcode() == Instruction::LShr) { + if (R->getOpcode() != Instruction::Shl) + break; + } else if (L->getOpcode() == Instruction::Shl) { + if (R->getOpcode() != Instruction::LShr) + break; + } else + break; + + // Helper to get a constant value if it exists. + auto AccumulateConstantIntVal = [&](Value *V) -> bool { + ConstantInt *CI = dyn_cast(V); + if (CI) + TotalVal += CI->getZExtValue(); + return CI; + }; + + if (!AccumulateConstantIntVal(L->getOperand(0)) && + !AccumulateConstantIntVal(L->getOperand(1))) + break; + if (!AccumulateConstantIntVal(R->getOperand(0)) && + !AccumulateConstantIntVal(R->getOperand(1))) + break; + Type *Ty = I->getType(); + if (Layout->getTypeSizeInBits(Ty) == TotalVal) { + unsigned FoldedCost = TargetTransformInfo::TCC_Basic * 2; + // If they are inputs then we add cost for removing this + // folding oppurtunity. + if (Info.InputIndexes[0] < FirstOccur || + Info.InputIndexes[1] < FirstOccur) { + CostPerOccurence += FoldedCost; + } + // If they are then we remove the folded cost. 
+ else + ChainCost -= FoldedCost; + } + break; + } + } + } + + /// Estimate inputs/outputs to this function. + unsigned TotalParamSize = 0; + unsigned NumPtrInputs = 0; + + // Estimate using first occurrence. + UniqueInputOperands.clear(); + for (Input &In : Data.InputSeq) { + Instruction *I = OM.getInstr(FirstOccur + In.InstrNo); + + // Check for a function call being turned into a call + // to a function pointer. + CallSite CS(I); + if (CS && In.OpNo == CS.getNumArgOperands()) + NewFunctionCost += TargetTransformInfo::TCC_Basic; + + Value *IOp = I->getOperand(In.OpNo); + // Add the operand to the unique set for the call. + bool Inserted = UniqueInputOperands.insert(IOp).second; + if (!Inserted) + continue; + // Check for a pointer that will need to be materialized. + Type *IOpTy = IOp->getType(); + if (IOpTy->isPointerTy()) + ++NumPtrInputs; + // Keep track of the total size of the inputs to the outlined + // function, we will use this for the cost of the call itself. + TotalParamSize += getTypeSize(*Layout, IOpTy, WidestRegister); + + // Constant operands may have additional cost. + bool HasFreeAddressComp = TTI.getAddressComputationCost(IOpTy) == 0; + Constant *COp = dyn_cast(IOp); + if (COp && !HasFreeAddressComp) + ChainCost -= getGlobalValueCost(COp); + } + + /// The new function contains one instance of the chain of instructions. + NewFunctionCost += ChainCost; + // Add the cost of constant materialization. + NewFunctionCost += MaterializedValues.size(); + + /// Add the cost for each output. + unsigned CostFromReLoad = 0; + for (size_t OutputIdx : Data.Outputs) { + Type *ParamEleTy = OM.getInstr(FirstOccur + OutputIdx)->getType(); + unsigned EstCost = + getRegisterUsage(*Layout, ParamEleTy, WidestRegister); + + /// There will be a store into this variable in the outlined function. + NewFunctionCost += EstCost; + + /// Each output value has a reload in the parent function. 
+ /// NOTE: This isn't entirely true if a specific instance doesn't use
+ /// the value.
+ CostFromReLoad += EstCost;
+ }
+
+ // Estimate the number of registers used in the function call.
+ unsigned NumCallRegisters = TotalParamSize / WidestRegister;
+ unsigned NumRegisters = TTI.getNumberOfRegisters(false);
+
+ /// A call is generated at each occurrence.
+ /// = call instruction + prepare each parameter + reload outputs.
+ CostPerOccurence += 1 + NumCallRegisters + CostFromReLoad;
+
+ DEBUG(dbgs() << "Inputs : " << Data.InputSeq.size() << "["
+ << UniqueInputOperands.size() << "]"
+ << "; Outputs : " << Data.Outputs.count() << "\n");
+ DEBUG(dbgs() << "Chain Cost : " << ChainCost << "\n");
+ DEBUG(dbgs() << "CostPerOccur : " << CostPerOccurence << "\n");
+
+ // No possibility of benefit.
+ if (CostPerOccurence >= ChainCost)
+ continue;
+
+ /// Add the cost of the constant input condensing.
+ /// Each of the fixups will likely result in an add instruction.
+ for (ConstantCondenseInstance &CInst : Data.ConstInsts) {
+ if (CInst.Cost == 0)
+ continue;
+ Input &I = CInst.InputSeqs.front();
+ Value *Op = OM.getInstrOp(FirstOccur + I.InstrNo, I.OpNo);
+ unsigned OpCost =
+ getRegisterUsage(*Layout, Op->getType(), WidestRegister);
+ NewFunctionCost += (CInst.Cost * OpCost);
+ }
+
+ // Penalize large amounts of materialized pointers.
+ // There are likely to be a number of spills so we conservatively say
+ // that each new input will spill.
+ if (NumPtrInputs > NumRegisters)
+ NewFunctionCost += NumRegisters + (NumPtrInputs - NumRegisters);
+
+ // Compute the benefit of the chain and each occurrence.
+ OC.BenefitPerOccur = ChainCost - CostPerOccurence;
+ unsigned OutlineBenefit = OC.BenefitPerOccur * NumOccurences;
+
+ // Outline cost statistics.
+ DEBUG(dbgs() << "BenefitPerOccur : " << OC.BenefitPerOccur << "\n");
+ DEBUG(dbgs() << "NewFunctionCost : " << NewFunctionCost << "\n");
+ DEBUG(dbgs() << "ParameterCost : " << NumCallRegisters << "\n");
+
+ // No benefit. 
+ if (OutlineBenefit <= NewFunctionCost) + continue; + + // As a final step we add in estimations for register pressure. We do + // this last because it can be costly to compute and is unnecessary + // if the candidate is already not going to be profitable. + unsigned RegisterCost = computeOutlinedFunctionRegisterCost(OC); + + // We have extra setup and save cost if we preserve the frame pointer. + bool ElimNonLeaf = + FirstOccurFn->hasFnAttribute("no-frame-pointer-elim-non-leaf"); + bool NeedsFramePointer = false; + // Check for non leaf frame pointer elim. + if (ElimNonLeaf) + NeedsFramePointer = ContainsCall; + // Check for leaf frame pointer elim. + else if (FirstOccurFn->hasFnAttribute("no-frame-pointer-elim")) { + Attribute Attr = FirstOccurFn->getFnAttribute("no-frame-pointer-elim"); + NeedsFramePointer = Attr.getValueAsString() == "true"; + } + // Accumulate the frame pointer cost. + if (NeedsFramePointer) + // save + register push/pop + RegisterCost += 3; + + NewFunctionCost += RegisterCost; + DEBUG(dbgs() << "Estimated register cost : " << RegisterCost << "\n"); + + // No benefit. + if (OutlineBenefit <= NewFunctionCost) + continue; + + Data.NumCallRegisters = NumCallRegisters; + OC.Benefit = OutlineBenefit - NewFunctionCost; + + // Less than desired benefit. + if (OC.Benefit < MinBenefit) { + OC.invalidate(); + continue; + } + } + } + + unsigned computeOutlinedFunctionRegisterCost(OutlineCandidate &OC) { + // Each 'key' in the map opens a new interval. The values + // of the map are the index of the 'last seen' usage of the + // instruction that is the key. + typedef DenseMap IntervalMap; + // Marks the end of each interval. + IntervalMap EndPoint; + // Saves the list of instruction indices that are used in the loop. + SmallSet Ends; + + unsigned FirstOccur = *OC.begin(); + for (unsigned i = 0, InstIdx = FirstOccur; i < OC.Len; ++i, ++InstIdx) { + // Save the end location of each USE. 
+ InstructionInfo &II = OM.getInstrInfo(InstIdx); + for (unsigned InputIdx : II.InputIndexes) { + // 0 denotes a constant or external input. + if (InputIdx == 0) + continue; + EndPoint[InputIdx] = i; + Ends.insert(InputIdx); + } + } + + // Saves the list of intervals that end with the index in 'key'. + typedef SmallVector InstrList; + DenseMap TransposeEnds; + + // Transpose the EndPoints to a list of values that end at each index. + for (auto &Interval : EndPoint) + TransposeEnds[Interval.second].push_back(Interval.first); + + Function *F = OM.getInstrFunction(*OC.begin()); + TargetTransformInfo &TTI = GetTTI(*F); + unsigned WidestRegister = TTI.getRegisterBitWidth(false); + unsigned NumRegisters = TTI.getNumberOfRegisters(false); + unsigned SpillCount = 0; + SmallSet OpenIntervals; + for (unsigned i = 0, InstIdx = FirstOccur; i < OC.Len; ++i, ++InstIdx) { + // Remove all of the instructions that end at this location. + InstrList &List = TransposeEnds[i]; + for (unsigned ToRemove : List) + OpenIntervals.erase(OM.getInstr(ToRemove)); + + InstructionInfo &II = OM.getInstrInfo(InstIdx); + if (II.Cost == TargetTransformInfo::TCC_Free) + continue; + + // Count the number of live intervals. + unsigned LiveIntervals = 0; + for (auto Inst : OpenIntervals) + LiveIntervals += + getRegisterUsage(*Layout, Inst->getType(), WidestRegister); + + // Check for spills. + if (LiveIntervals > NumRegisters) { + SpillCount += LiveIntervals - NumRegisters; + for (unsigned i = 0, e = LiveIntervals - NumRegisters; i < e; ++i) + OpenIntervals.erase(*OpenIntervals.begin()); + } + + // Add the current instruction to the list of open intervals. + OpenIntervals.insert(OM.getInstr(InstIdx)); + } + return SpillCount; + } + + void computeOutlineCallRegisterCost() { + DenseMap> BBsToCompute; + DenseMap OccurToUsage; + + // Collect all of the occurrences and blocks that need to have their + // usages computed. 
+ for (OutlineCandidate &OC : CandidateList) { + if (!OC.isValid()) + continue; + for (unsigned Occur : OC) { + BasicBlock *Par = OM.getInstr(Occur)->getParent(); + BBsToCompute[Par].insert(Occur); + } + } + + // Nothing to compute. + if (BBsToCompute.empty()) + return; + + // Each 'key' in the map opens a new interval. The values + // of the map are the index of the 'last seen' usage of the + // instruction that is the key. + typedef DenseMap IntervalMap; + // Marks the end of each interval. + IntervalMap EndPoint; + + BasicBlock *FirstBB = BBsToCompute.begin()->first; + TargetTransformInfo &TTI = GetTTI(*FirstBB->getParent()); + unsigned WidestRegister = TTI.getRegisterBitWidth(false); + unsigned NumRegisters = TTI.getNumberOfRegisters(false); + + // Perform a similar register cost to computeOutlinedFunctionRegisterCost. + SmallSetVector BBInsts; + SmallPtrSet BBLiveins; + for (auto &BBPair : BBsToCompute) { + // Reset previous usage. + EndPoint.clear(); + BBInsts.clear(); + BBLiveins.clear(); + + // Collect the instructions in this block that we want to compute usages + // for. + BasicBlock *Par = BBPair.first; + for (unsigned Occur : BBPair.second) + BBInsts.insert(OM.getInstr(Occur)); + Instruction *FarthestInstr = BBInsts.back(); + auto BBE = FarthestInstr->getIterator(); + ++BBE; + + // Build the live ranges for the instructions in the block. + for (auto BBI = Par->begin(); BBI != BBE; ++BBI) { + // Add the liveins to the occurrence usage. + for (Value *Op : BBI->operands()) { + Instruction *IOp = dyn_cast(Op); + if (IOp && IOp->getParent() == Par) + EndPoint[IOp] = &*BBI; + } + } + + // Saves the list of intervals that end with the index in 'key'. + typedef SmallVector InstrList; + DenseMap TransposeEnds; + + // Remove the instructions that are live out. + for (auto BBI = Par->begin(); BBI != BBE; ++BBI) + if (BBI->isUsedOutsideOfBlock(Par)) + EndPoint.erase(&*BBI); + + // Transpose the EndPoints to a list of values that end at each index. 
+ for (auto &Interval : EndPoint) + TransposeEnds[Interval.second].push_back(Interval.first); + + SmallSet OpenIntervals; + for (auto BBI = Par->begin(); BBI != BBE; ++BBI) { + Instruction *I = &*BBI; + + // Remove all of the instructions that end at this location. + InstrList &List = TransposeEnds[I]; + for (Instruction *ToRemove : List) + OpenIntervals.erase(ToRemove); + + // Small helper lambda for computing the register usage. + auto GetRegUsage = [&]() { + unsigned RegUsage = 0; + for (auto Inst : OpenIntervals) + RegUsage += + getRegisterUsage(*Layout, Inst->getType(), WidestRegister); + return RegUsage; + }; + + // Count the number of live intervals. + if (BBInsts.count(I)) + OccurToUsage[OM.getInstrIdx(I)] += GetRegUsage(); + + // Add the current instruction to the list of open intervals. + if (!I->use_empty()) + OpenIntervals.insert(I); + } + } + + DEBUG(dbgs() << "\n * Estimating spill cost per occurrence *\n"); + DEBUG(dbgs() << " - Num Registers : " << NumRegisters << "\n"); + unsigned ViableRegisters = NumRegisters; + + // After we have the usages we rewalk the outline candidates and + // incorporate that info into the cost model. + for (OutlineCandidate &OC : CandidateList) { + if (!OC.isValid()) + continue; + DEBUG(dbgs() << "\nCandidate : " << OC.ID << "\n"); + AdditionalCandidateData &Data = OM.getCandidateData(OC); + unsigned NumOutputs = Data.Outputs.count(); + for (unsigned i = 0, e = OC.size(); i < e; ++i) { + unsigned Occur = OC.getOccurrence(i); + // Get the max usage for the beginning of this call. + unsigned Usage = OccurToUsage[Occur]; + + // Add in the liveins from function parameters. 
+ BBLiveins.clear(); + Instruction *OccurI = OM.getInstr(Occur); + BasicBlock *OccurPar = OccurI->getParent(); + for (Input &In : Data.InputSeq) { + Value *Op = OM.getInstrOp(Occur + In.InstrNo, In.OpNo); + Instruction *IOp = dyn_cast(Op); + if (IOp && IOp->getParent() != OccurPar) + if (BBLiveins.insert(IOp).second) + ++Usage; + } + + // Estimate any spills. + unsigned TotalUsage = Usage + NumOutputs + Data.NumCallRegisters; + unsigned Cost = 0; + if (TotalUsage > ViableRegisters) + Cost = TotalUsage - ViableRegisters; + + DEBUG(dbgs() << "Occur " << i << "\n"); + DEBUG(dbgs() << " - Usage : " << Usage << "\n"); + DEBUG(dbgs() << " - Call Registers : " << Data.NumCallRegisters + << "\n"); + DEBUG(dbgs() << " - Outputs : " << NumOutputs << "\n"); + DEBUG(dbgs() << " - Cost : " << Cost << "\n"); + + if (OC.Benefit <= Cost) { + OC.invalidate(); + break; + } + OC.Benefit -= Cost; + } + } + } + + /// Mapper containing information for the current module. + IROutlinerMapper &OM; + /// Our current list of candidates. + std::vector &CandidateList; + /// Functor for getting the target information for a function. + function_ref GetTTI; + /// Current Module data layout. + const DataLayout *Layout; +}; + +// \brief Outline all of the profitable candidates. +void outlineCandidates(MutableArrayRef CL, + IROutlinerMapper &OM, bool HasProfileData) { + FunctionSplicer FS(HasProfileData); + for (const OutlineCandidate &OC : CL) { + if (!OC.isValid()) + continue; + ++NumCandidatesOutlined; + + // Prepare->Outline->Finalize + FS.prepareForNewCandidate(OC, OM); + for (unsigned Occur : OC) + FS.outlineOccurrence(OC, Occur, OM); + FS.finalize(OC, OM); + } +} + +// Run the outliner over the provided module /p M. +bool runImpl(Module &M, ProfileSummaryInfo *PSI, + function_ref GetBFI, + function_ref GetTTI) { + IROutlinerMapper OM; + std::vector CCVec = llvm::mapIRModule(OM, M, PSI, GetBFI); + OM.initAdditionalData(); + + // No potential candidates. 
+ if (CCVec.size() < MinOccurrences * MinInstructionLength) + return false; + + // Pre prune functor for any known unprofitable candidates. + std::function, unsigned)> PrePruneFn = + [&](ArrayRef Occurs, unsigned Len) { + if (Len != 1) + return false; + Instruction *I = OM.getInstr(Occurs[0]); + // Empty function call. + if (CallSite CS = CallSite(I)) + return CS.getCalledFunction() && CS.getNumArgOperands() == 0; + // Non constant operation. + return none_of(I->operand_values(), + [](Value *V) { return isa(V); }); + }; + + // Find outlining candidates. + std::vector CL = findSequentialOutliningCandidates( + PrePruneFn, CCVec, MinInstructionLength, MinOccurrences); + if (CL.empty()) + return false; + + // Analyze candidate profitability. + OutlinerAnalysis OAI(OM, CL, GetTTI, &M.getDataLayout()); + OAI.analyzeCandidateList(); + + // Prune any overlaps. + if (!pruneSequentialOutlineCandidateList(CL, OM.getNumMappedInstructions())) + return false; + + // Outline the profitable candidates. + outlineCandidates(CL, OM, PSI->hasProfileSummary()); + return true; +} +} // namespace + +PreservedAnalyses CodeSizeOutlinerPass::run(Module &M, + ModuleAnalysisManager &AM) { + auto &FAM = AM.getResult(M).getManager(); + std::function GetTTI = + [&](Function &F) -> TargetTransformInfo & { + return FAM.getResult(F); + }; + std::function GetBFI = + [&](Function &F) -> BlockFrequencyInfo & { + return FAM.getResult(F); + }; + ProfileSummaryInfo *PSI = &AM.getResult(M); + return runImpl(M, PSI, GetBFI, GetTTI) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} + +struct CodeSizeOutlinerLegacyPass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + CodeSizeOutlinerLegacyPass() : ModulePass(ID) { + initializeCodeSizeOutlinerLegacyPassPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + TargetTransformInfoWrapperPass *TTIWP = + &getAnalysis(); + DenseMap TTIMap; + std::function GetTTI = + [&](Function &F) -> TargetTransformInfo & { + auto TTIIt = TTIMap.find(&F); + if (TTIIt == TTIMap.end()) + TTIIt = TTIMap.try_emplace(&F, std::move(TTIWP->getTTI(F))).first; + return TTIIt->second; + }; + std::unique_ptr CurBFI; + std::function GetBFI = + [&](Function &F) -> BlockFrequencyInfo & { + DominatorTree DT(F); + LoopInfo LI(DT); + BranchProbabilityInfo BPI(F, LI); + CurBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); + return *CurBFI.get(); + }; + ProfileSummaryInfo *PSI = + getAnalysis().getPSI(); + return runImpl(M, PSI, GetBFI, GetTTI); + } +}; + +char CodeSizeOutlinerLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(CodeSizeOutlinerLegacyPass, DEBUG_TYPE, + "Code Size Outliner", false, false) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(CodeSizeOutlinerLegacyPass, DEBUG_TYPE, + "Code Size Outliner", false, false) +ModulePass *llvm::createCodeSizeOutlinerPass() { + return new CodeSizeOutlinerLegacyPass(); +} Index: lib/Transforms/IPO/IPO.cpp =================================================================== --- lib/Transforms/IPO/IPO.cpp +++ lib/Transforms/IPO/IPO.cpp @@ -27,6 +27,7 @@ initializeArgPromotionPass(Registry); initializeCalledValuePropagationLegacyPassPass(Registry); initializeConstantMergeLegacyPassPass(Registry); + 
initializeCodeSizeOutlinerLegacyPassPass(Registry); initializeCrossDSOCFIPass(Registry); initializeDAEPass(Registry); initializeDAHPass(Registry); Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -160,6 +160,14 @@ EnableCHR("enable-chr", cl::init(true), cl::Hidden, cl::desc("Enable control height reduction optimization (CHR)")); +static cl::opt EnableCSO( + "enable-cso", cl::init(false), cl::Hidden, + cl::desc("Enable outlining for code size (default = off)")); + +static cl::opt EnableEarlyCSO( + "enable-early-cso", cl::init(false), cl::Hidden, + cl::desc("Enable an early run of the code size outliner pass (default = off)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -524,6 +532,13 @@ if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) addPGOInstrPasses(MPM); + // Add an early run of the code size outliner pass. + if (EnableEarlyCSO && SizeLevel > 0) { + MPM.add(createSeparateConstOffsetFromGEPPass()); + MPM.add(createCodeSizeOutlinerPass()); + MPM.add(createPostOrderFunctionAttrsLegacyPass()); + } + // We add a module alias analysis pass here. In part due to bugs in the // analysis infrastructure this "works" in that the analysis stays alive // for the entire SCC pass run below. @@ -716,6 +731,13 @@ MPM.add(createConstantMergePass()); // Merge dup global constants } + // Add a late run of the code size outliner pass. + if (EnableCSO && SizeLevel > 0) { + MPM.add(createCodeSizeOutlinerPass()); + MPM.add(createTailCallEliminationPass()); + MPM.add(createPostOrderFunctionAttrsLegacyPass()); + } + if (MergeFunctions) MPM.add(createMergeFunctionsPass()); @@ -898,6 +920,9 @@ void PassManagerBuilder::addLateLTOOptimizationPasses( legacy::PassManagerBase &PM) { + // Add a late run of the code size outliner pass. 
+ PM.add(createCodeSizeOutlinerPass()); + // Delete basic blocks, which optimization passes may have killed. PM.add(createCFGSimplificationPass()); Index: lib/Transforms/Utils/CMakeLists.txt =================================================================== --- lib/Transforms/Utils/CMakeLists.txt +++ lib/Transforms/Utils/CMakeLists.txt @@ -41,6 +41,7 @@ MetaRenamer.cpp ModuleUtils.cpp NameAnonGlobals.cpp + Outliner.cpp PredicateInfo.cpp PromoteMemoryToRegister.cpp StripGCRelocates.cpp Index: lib/Transforms/Utils/Outliner.cpp =================================================================== --- /dev/null +++ lib/Transforms/Utils/Outliner.cpp @@ -0,0 +1,838 @@ +//===- Outliner.cpp - Generic outliner interface -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements an interface for outlining congruent sequences. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/Outliner.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PriorityQueue.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" + +using namespace llvm; + +static cl::opt AllowODR( + "cso-allow-odr", cl::init(false), cl::Hidden, + cl::desc("Allow outlining from linkonce_odr and weak_odr functions.")); + +static cl::opt DumpCC( + "cso-dump-cc", cl::init(false), cl::Hidden, + cl::desc("Dump information about the congruent between instructions.")); + +// Suffix array implementation. +namespace { +/// \brief Compute the suffix array. +// Basic adapted implementation of SA-IS algorithm. 
+class SuffixArray { +public: + // Compute the suffix array of /p S with given alphabet size /p AlphabetSize + // and store the result in /p SA + static std::vector compute(std::vector &S) { + // REQUIRED: N-1 must be 0 to act as a sentinel for the suffix array + // algorithm. + S.push_back(0); + + // Get the alphabet size of the unsigned vector. + unsigned AlphabetSize = 0; + for (unsigned CC : S) + if (CC > AlphabetSize) + AlphabetSize = CC; + ++AlphabetSize; + + SuffixArray SACtr(S.size()); + SACtr.computeSAIS(ArrayRef(S), S.size(), AlphabetSize); + return std::move(SACtr.SA); + } + +private: + SuffixArray(size_t ArraySize) { SA.resize(ArraySize); } + + template + void computeSAIS(ArrayRef S, int N, unsigned AlphabetSize) { + // Bitvector for LS-type array. + BitVector LTypeArray(N); + + // Classify each character from S as either LType or SType. + LTypeArray.set(N - 1); + for (int i = N - 3, e = 0; i >= e; --i) { + // S(i) is type S iff: S(i) < S(i+1) or S(i)==S(i+1) and S(i+1) is type + // S + if (S[i] < S[i + 1] || (S[i] == S[i + 1] && LTypeArray.test(i + 1))) + LTypeArray.set(i); + } + + // Stage 1: Reduce the problem and bucket sort all S-substrings. + Bucket.resize(AlphabetSize + 1); + /// Get the bucket ends. + getBuckets(S, true, N, AlphabetSize); + for (int i = 0; i < N; ++i) + SA[i] = -1; + for (int i = 1; i < N; ++i) + if (isLMS(i, LTypeArray)) + SA[--Bucket[S[i]]] = i; + induceSA(S, LTypeArray, N, AlphabetSize); + Bucket.clear(); + + /// Compact the sorted substrings into the first N1 items of the suffix + /// array. + int N1 = 0; + for (int i = 0; i < N; ++i) + if (isLMS(SA[i], LTypeArray)) + SA[N1++] = SA[i]; + + /// Find the lexicographic names of the substrings. 
+ for (int i = N1; i < N; ++i) + SA[i] = -1; + int Name = 0, Prev = -1; + for (int i = 0; i < N1; ++i) { + int Pos = SA[i]; + for (int d = 0; d < N; ++d) { + if (Prev == -1 || S[Pos + d] != S[Prev + d] || + LTypeArray.test(Pos + d) != LTypeArray.test(Prev + d)) { + ++Name; + Prev = Pos; + break; + } + if (d > 0 && + (isLMS(Pos + d, LTypeArray) || isLMS(Prev + d, LTypeArray))) + break; + } + Pos = (Pos % 2 == 0) ? Pos / 2 : (Pos - 1) / 2; + SA[N1 + Pos] = Name - 1; + } + for (int i = N - 1, j = i; i >= N1; --i) + if (SA[i] >= 0) + SA[j--] = SA[i]; + + // Stage 2: Solve the reduced problem. + /// If the names aren't unique enough yet, we recurse until they are. + size_t S1Start = N - N1; + int *S1 = SA.data() + S1Start; + if (Name < N1) + computeSAIS(ArrayRef(S1, N1), N1, Name - 1); + // Otherwise we can compute the suffix array directly. + else { + for (int i = 0; i < N1; ++i) + SA[S1[i]] = i; + } + + // Stage 3: Induce the result from the reduced solution. + Bucket.resize(AlphabetSize + 1); + /// Place the LMS characters into their respective buckets. + getBuckets(S, true, N, AlphabetSize); + /// Get P1. + for (int i = 1, j = 0; i < N; ++i) + if (isLMS(i, LTypeArray)) + S1[j++] = i; + /// Get the index in S. + for (int i = 0; i < N1; ++i) + SA[i] = S1[SA[i]]; + /// Initialize the suffix array from N1 to N - 1. + for (int i = N1; i < N; ++i) + SA[i] = -1; + for (int i = N1 - 1; i >= 0; --i) { + int j = SA[i]; + SA[i] = -1; + SA[--Bucket[S[j]]] = j; + } + induceSA(S, LTypeArray, N, AlphabetSize); + } + + // Check to see if S(Idx) is a left most S-type character. + bool isLMS(int Idx, BitVector <ypeArray) { + return Idx > 0 && LTypeArray.test(Idx) && !LTypeArray.test(Idx - 1); + } + template + void getBuckets(ArrayRef S, bool End, unsigned N, unsigned AlphabetSize) { + /// Clear buckets. + Bucket.assign(AlphabetSize + 1, 0); + /// Compute the size of each bucket. 
+ for (size_t i = 0, e = S.size(); i < e; ++i) + ++Bucket[S[i]]; + int Sum = 0; + if (!End) { + for (size_t i = 0, e = AlphabetSize + 1; i < e; ++i) { + Sum += Bucket[i]; + Bucket[i] = Sum - Bucket[i]; + } + } else + for (size_t i = 0; i <= AlphabetSize; ++i) + Bucket[i] = Sum += Bucket[i]; + } + + // Compute SA1 + template + void induceSA(ArrayRef S, BitVector <ypeArray, unsigned N, + unsigned AlphabetSize) { + // Induce SA1 + getBuckets(S, false, N, AlphabetSize); + for (size_t i = 0; i < N; ++i) { + int j = SA[i] - 1; + if (j >= 0 && !LTypeArray.test(j)) + SA[Bucket[S[j]]++] = j; + } + // Induce Sas + getBuckets(S, true, N, AlphabetSize); + for (ssize_t i = N - 1; i >= 0; --i) { + int j = SA[i] - 1; + if (j >= 0 && LTypeArray.test(j)) + SA[--Bucket[S[j]]] = j; + } + } + std::vector SA; + std::vector Bucket; +}; +// Construct the LCP array for a given suffix array /p SA and string /p S. +static std::vector computeLCP(ArrayRef S, ArrayRef SA) { + int N = S.size(); + std::vector LCP(N), Rank(N); + for (int i = 0; i < N; ++i) + Rank[SA[i]] = i; + for (int i = 0, k = 0; i < N; ++i) { + if (Rank[i] == N - 1) { + k = 0; + continue; + } + int j = SA[Rank[i] + 1]; + while (i + k < N && j + k < N && S[i + k] == S[j + k]) + ++k; + LCP[Rank[i]] = k; + if (k > 0) + --k; + } + return LCP; +} +} // namespace + +// Candidate Selection. +std::vector llvm::findSequentialOutliningCandidates( + function_ref, unsigned)> PrePruneFn, + std::vector &Vec, unsigned MinInstructionLen, + unsigned MinOccurrences) { + std::vector CL; + + // Build the suffix array and longest common prefix array. + std::vector SuffixArr = SuffixArray::compute(Vec); + std::vector LcpArr = computeLCP(Vec, SuffixArr); + + // An interval tree of our current candidates. + BitVector CandidateInterval(Vec.size()); + + // Walk the suffix array to build potential candidates. 
+ SmallDenseSet FailedOccurrences;
+ size_t PrevSize = 0;
+ std::vector Occurrences;
+ for (size_t i = 1, e = SuffixArr.size(); i < e; ++i) {
+ size_t Size = LcpArr[i];
+
+ // Skip over an invalid size.
+ if (Size < MinInstructionLen) {
+ PrevSize = 0;
+ continue;
+ }
+
+ size_t OccurIdx = SuffixArr[i];
+
+ // We have already matched against this size.
+ if (PrevSize >= Size) {
+ PrevSize = Size;
+ continue;
+ }
+
+ // Create a new interval tree with our current candidate to pre prune
+ // overlaps.
+ Occurrences.clear();
+ CandidateInterval.set(OccurIdx, OccurIdx + Size);
+ Occurrences.push_back(OccurIdx);
+ FailedOccurrences.clear();
+ bool HasPreviousSharedOccurrence = false;
+
+ // Continuously consider potential chain sizes for this candidate until
+ // they are no longer profitable.
+ size_t OrigSize = Size, LastValidSize = 0;
+ for (size_t SizeFromIdx = i, AugmentAmount = 0;
+ Size >= MinInstructionLen;) {
+ bool AddedNewOccurrence = false;
+
+ // Augment the candidate set by the change in size from the
+ // last iteration.
+ if (AugmentAmount > 0)
+ for (size_t Idx : Occurrences)
+ CandidateInterval.reset(Idx + Size, Idx + Size + AugmentAmount);
+ LastValidSize = Size;
+
+ // After augmenting the candidate set, there may be new candidates that
+ // no longer overlap with any of the others currently being considered.
+ for (auto It = FailedOccurrences.begin(), E = FailedOccurrences.end();
+ It != E;) {
+ size_t Idx = *It;
+ ++It;
+ if (CandidateInterval.find_first_in(Idx, Idx + Size) != -1)
+ continue;
+ FailedOccurrences.erase(Idx);
+ CandidateInterval.set(Idx, Idx + Size);
+ Occurrences.push_back(Idx);
+ AddedNewOccurrence = true;
+ }
+
+ // Count the number of occurrences.
+ for (size_t j = i + Occurrences.size(); j < e; ++j) {
+ // The longest common prefix must be able to hold our size.
+ if ((size_t)LcpArr[j - 1] < Size)
+ break;
+
+ // Check to see if this candidate overlaps with any of our currently
+ // considered candidates. 
If it doesn't we add it to our current set. + size_t JIdx = SuffixArr[j]; + if (CandidateInterval.find_first_in(JIdx, JIdx + Size) == -1) { + CandidateInterval.set(JIdx, JIdx + Size); + Occurrences.push_back(JIdx); + AddedNewOccurrence = true; + } else + FailedOccurrences.insert(JIdx); + + // If our next size is less than the current, we won't get any more + // candidates for this chain. + if ((size_t)LcpArr[j] < Size) + break; + } + + // If we added a new candidate and we have enough to satisfy our + // constraints then we build a new outline chain candidate. + if (AddedNewOccurrence && Occurrences.size() >= MinOccurrences) { + // Recheck the prune size each iteration. + if (!PrePruneFn || !PrePruneFn(Occurrences, Size)) { + /// Cache shared sizes between candidates chains to make analysis + /// easier. + if (HasPreviousSharedOccurrence) + CL.back().SharedSizeWithNext = Size; + else + HasPreviousSharedOccurrence = true; + /// Build new function with candidate sequence. + CL.emplace_back(CL.size(), Size, Occurrences); + } + } + + // Find the next size to consider for this candidate. + for (size_t NewSizeE = e - 1; ++SizeFromIdx < NewSizeE;) { + size_t NewSize = static_cast(LcpArr[SizeFromIdx]); + if (NewSize < Size) { + AugmentAmount = Size - NewSize; + Size = NewSize; + break; + } + } + + // If we have already encountered a greater size, then the new size + // was either invalid or we've already considered this size but + // with more candidates. + if (Size == LastValidSize || Size <= PrevSize) + break; + } + for (unsigned Idx : Occurrences) + CandidateInterval.reset(Idx, Idx + LastValidSize); + PrevSize = OrigSize; + } + return CL; +} + +// Candidate Pruning. +bool llvm::pruneSequentialOutlineCandidateList( + MutableArrayRef CL, unsigned NumMappedInstructions) { + // Helper comparator for candidate indexes. 
+ struct Comparator {
+ Comparator(ArrayRef CL) : CL(CL) {}
+ bool operator()(unsigned L, unsigned R) {
+ return CL[L].Benefit < CL[R].Benefit;
+ }
+ ArrayRef CL;
+ };
+
+ // Build a priority worklist for the valid candidates.
+ std::vector OriginalSetOfValidCands;
+ OriginalSetOfValidCands.reserve(CL.size());
+ for (unsigned i = 0, e = CL.size(); i < e; ++i)
+ if (CL[i].isValid())
+ OriginalSetOfValidCands.push_back(i);
+
+ // Priority queue of the most profitable candidates.
+ Comparator C(CL);
+ PriorityQueue, Comparator> MostBenefitial(
+ C, OriginalSetOfValidCands);
+
+ // BitVector of all of the occurrence instructions that have been tagged
+ // as profitable to outline. We use this to prune less beneficial
+ // occurrences.
+ BitVector InsertedOccurrences(NumMappedInstructions);
+ bool HasProfitableCandidate = false;
+ while (!MostBenefitial.empty()) {
+ // Get the next most beneficial candidate.
+ unsigned CandIdx = MostBenefitial.top();
+ MostBenefitial.pop();
+ OutlineCandidate &OC = CL[CandIdx];
+
+ // Check overlaps.
+ bool PrunedAnOccurrence = false;
+ for (ssize_t i = OC.size() - 1; i >= 0; --i) {
+ unsigned Occur = OC.getOccurrence(i);
+ if (InsertedOccurrences.find_first_in(Occur, Occur + OC.Len) == -1)
+ continue;
+ if (OC.Benefit < OC.BenefitPerOccur) {
+ OC.invalidate();
+ break;
+ }
+ OC.Benefit -= OC.BenefitPerOccur;
+ OC.removeOccurrence(i);
+ PrunedAnOccurrence = true;
+ }
+
+ // Skip this candidate if pruning made it unprofitable.
+ if (!OC.isValid())
+ continue;
+
+ // If we pruned an occurrence, we re-add the candidate
+ // so that we can get the candidate with the next highest benefit.
+ if (PrunedAnOccurrence) {
+ MostBenefitial.push(CandIdx);
+ continue;
+ }
+
+ // Tag our valid occurrences.
+ for (unsigned Occur : OC)
+ InsertedOccurrences.set(Occur, Occur + OC.Len);
+ HasProfitableCandidate = true;
+ }
+ return HasProfitableCandidate;
+}
+
+/// Mapping an IR module for outlining. 
+
+/// \brief Expression with relaxed equivalency constraints.
+///
+/// Wraps one Instruction and supplies hashing/equality that compares operand
+/// *types* rather than operand values, so structurally similar instructions
+/// from different functions can land in the same congruence class for
+/// outlining.
+///
+/// NOTE(review): many casts and templated declarations in this block are
+/// missing their template argument lists (e.g. "dyn_cast(&I)",
+/// "cast(Inst)", "static_cast(HashVal)", "SmallVector HashRange",
+/// "PointerLikeTypeTraits::NumLowBitsAvailable", "llvm::DenseMapInfo");
+/// the '<...>' text appears to have been stripped when this patch was
+/// extracted. Restore the arguments from the original revision before
+/// compiling.
+class RelaxedExpression {
+public:
+  /// A special state for special equivalence constraints.
+  ///  - StructGep: GEP that indexes into a struct; struct indices must be
+  ///    identical constants for two expressions to be congruent.
+  ///  - ConstrainedCall: call whose callee cannot have its address taken
+  ///    (inline asm, invoke, non-addressable function), so extra
+  ///    operand-level checks apply.
+  enum SpecialState { StructGep, ConstrainedCall, None };
+
+  /// Classifies \p I into its special state up front; the hash is computed
+  /// lazily on first use (see getComputedHash).
+  RelaxedExpression(Instruction &I)
+      : Inst(&I), SS(SpecialState::None), HashVal(0) {
+    // Check the special state.
+    CallSite CS(Inst);
+    if (CS) {
+      /// Don't take the address of inline asm calls.
+      if (CS.isInlineAsm() || CS.isInvoke())
+        SS = RelaxedExpression::ConstrainedCall;
+      /// Intrinsics and functions without exact definitions can not
+      /// have their address taken.
+      else if (Function *F = CS.getCalledFunction()) {
+        if (!canTakeFnAddress(F))
+          SS = RelaxedExpression::ConstrainedCall;
+      }
+    }
+    /// Struct geps require constant indices.
+    else if (GetElementPtrInst *GEP = dyn_cast(&I)) {
+      if (gepContainsStructType(GEP))
+        SS = RelaxedExpression::StructGep;
+    }
+  }
+
+  /// Relaxed structural equality against \p OE. Special states must match;
+  /// calls additionally compare arity, calling convention, attributes and
+  /// operand-bundle schemas; other instructions fall back to
+  /// isSameOperationAs; opcode-specific constraints run last.
+  bool equals(const RelaxedExpression &OE) const {
+    if (SS != OE.SS)
+      return false;
+    // Handle calls separately to allow for mismatched tail calls.
+    CallSite CS(Inst);
+    if (CS) {
+      CallSite RCS(OE.Inst);
+      if (CS.getNumArgOperands() != RCS.getNumArgOperands())
+        return false;
+      if (CS.getCallingConv() != RCS.getCallingConv() ||
+          CS.getAttributes() != RCS.getAttributes())
+        return false;
+      // Operand bundle schemas must be identical on both sides.
+      if (CS.isInvoke()) {
+        InvokeInst *L = cast(Inst), *R = cast(OE.Inst);
+        if (!L->hasIdenticalOperandBundleSchema(*R))
+          return false;
+      } else {
+        CallInst *L = cast(Inst), *R = cast(OE.Inst);
+        if (!L->hasIdenticalOperandBundleSchema(*R))
+          return false;
+      }
+    } else if (isa(Inst))
+      // NOTE(review): the isa<> target here was lost in extraction; this
+      // case is compared by result type only — confirm against the
+      // original revision which instruction kind this covers.
+      return Inst->getType() == OE.Inst->getType();
+    else if (!Inst->isSameOperationAs(OE.Inst,
+                                      Instruction::CompareIgnoringAlignment))
+      return false;
+    return checkSpecialEquivalence(OE);
+  }
+
+  /// Returns the cached hash, computing it on first request. A stored value
+  /// of 0 means "not yet computed"; HashVal is mutable so this can be const.
+  hash_code getComputedHash() const {
+    if (static_cast(HashVal) == 0)
+      HashVal = getHashValue();
+    return HashVal;
+  }
+
+  /// Hashes the special state, operand count, result type, and the operand
+  /// *types*. Operand values are deliberately excluded to keep the
+  /// equivalence relaxed; call operands are skipped entirely since calls are
+  /// compared separately in equals().
+  hash_code getHashValue() const {
+    SmallVector HashRange;
+    HashRange.push_back(SS);
+    HashRange.push_back(Inst->getNumOperands());
+    HashRange.push_back(reinterpret_cast(Inst->getType()));
+    if (!isa(Inst))
+      for (auto &Op : Inst->operands())
+        HashRange.emplace_back(reinterpret_cast(Op->getType()));
+    return hash_combine_range(HashRange.begin(), HashRange.end());
+  }
+
+private:
+  // Check to see if we can take the address of a function.
+  bool canTakeFnAddress(Function *F) const {
+    // A function marked 'alwaysinline' with available_externally linkage can't
+    // have its address taken. Doing so would create an undefined external ref
+    // to the function, which would fail to link.
+    bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
+    if (HasAvailableExternallyLinkage &&
+        F->hasFnAttribute(Attribute::AlwaysInline))
+      return false;
+    return F->hasExactDefinition() || F->hasAddressTaken() ||
+           F->hasLinkOnceLinkage();
+  }
+
+  // Special checks for instructions that have non generic equivalency.
+  bool checkSpecialEquivalence(const RelaxedExpression &Other) const {
+    const Instruction *OE = Other.Inst;
+    switch (Inst->getOpcode()) {
+    case Instruction::ShuffleVector: {
+      // LangRef : The shuffle mask operand is required to be a
+      // constant vector with either constant integer or undef values.
+      return Inst->getOperand(2) == OE->getOperand(2);
+    }
+    case Instruction::Call:
+    case Instruction::Invoke: {
+      CallSite LCS(Inst);
+      CallSite RCS(const_cast(OE));
+      if (SS == ConstrainedCall)
+        return checkConstrainedCallEquivalence(LCS, RCS);
+      // Unconstrained calls only need matching function types.
+      return LCS.getFunctionType() == RCS.getFunctionType();
+    }
+    case Instruction::GetElementPtr: {
+      // Struct indices must be constant.
+      if (SS == StructGep)
+        return compareStructIndices(cast(Inst),
+                                    cast(OE));
+      break;
+    }
+    default:
+      break;
+    }
+    return true;
+  }
+
+  // For constrained calls the callee must be identical; for a known set of
+  // intrinsics, the immediate arguments that cannot become outlined-function
+  // parameters (sizes, flags, hints) must also match exactly.
+  bool checkConstrainedCallEquivalence(const CallSite &LCS,
+                                       const CallSite &RCS) const {
+    const Value *CIVal = LCS.getCalledValue();
+    if (CIVal != RCS.getCalledValue())
+      return false;
+    if (const Function *CalledII = dyn_cast(CIVal)) {
+      switch (CalledII->getIntrinsicID()) {
+      case Intrinsic::memmove:
+      case Intrinsic::memcpy:
+      case Intrinsic::memset:
+        /// Size.
+        return LCS.getArgOperand(2) == RCS.getArgOperand(2) &&
+               /// Volatile flag.
+               LCS.getArgOperand(3) == RCS.getArgOperand(3);
+      case Intrinsic::objectsize:
+        /// Min.
+        return LCS.getArgOperand(1) == RCS.getArgOperand(1) &&
+               /// Null unknown.
+               LCS.getArgOperand(2) == RCS.getArgOperand(2);
+      case Intrinsic::expect:
+        /// Expected value.
+        return LCS.getArgOperand(1) == RCS.getArgOperand(1);
+      case Intrinsic::prefetch:
+        /// RW.
+        return LCS.getArgOperand(1) == RCS.getArgOperand(1) &&
+               /// Locality.
+               LCS.getArgOperand(2) == RCS.getArgOperand(2) &&
+               /// Cache Type.
+               LCS.getArgOperand(3) == RCS.getArgOperand(3);
+      case Intrinsic::ctlz:
+      case Intrinsic::cttz:
+        /// Is Zero Undef.
+        return LCS.getArgOperand(1) == RCS.getArgOperand(1);
+      case Intrinsic::invariant_start:
+        // Size.
+        return LCS.getArgOperand(0) == RCS.getArgOperand(0);
+      default:
+        break;
+      }
+    }
+    return true;
+  }
+
+  // Walks both GEPs' type iterators in lockstep: any index that selects a
+  // struct field must be the very same operand on both sides.
+  bool compareStructIndices(GetElementPtrInst *L,
+                            const GetElementPtrInst *R) const {
+    gep_type_iterator LIt = gep_type_begin(L), LE = gep_type_end(L);
+    gep_type_iterator RIt = gep_type_begin(R);
+    for (; LIt != LE; ++LIt, ++RIt) {
+      if (LIt.isStruct() && LIt.getOperand() != RIt.getOperand())
+        return false;
+    }
+    return true;
+  }
+
+  // Check to see if the provided gep /p GEP indexes a struct type.
+  bool gepContainsStructType(GetElementPtrInst *GEP) {
+    gep_type_iterator It = gep_type_begin(GEP), E = gep_type_end(GEP);
+    for (; It != E; ++It)
+      if (It.isStruct())
+        return true;
+    return false;
+  }
+
+  /// The wrapped instruction (not owned).
+  Instruction *Inst;
+  /// Extra equivalence-constraint classification for Inst.
+  SpecialState SS;
+  /// Lazily-computed hash cache; mutable so getComputedHash() stays const.
+  mutable hash_code HashVal;
+};
+
+// DenseMapInfo for the relaxed expression class.
+// The empty/tombstone keys are manufactured sentinel pointer values shifted
+// past the low bits PointerLikeTypeTraits guarantees to be free, so they can
+// never collide with a real allocation.
+template <> struct llvm::DenseMapInfo {
+  static const RelaxedExpression *getEmptyKey() {
+    auto Val = static_cast(-1);
+    Val <<=
+        PointerLikeTypeTraits::NumLowBitsAvailable;
+    return reinterpret_cast(Val);
+  }
+  static const RelaxedExpression *getTombstoneKey() {
+    auto Val = static_cast(~1U);
+    Val <<=
+        PointerLikeTypeTraits::NumLowBitsAvailable;
+    return reinterpret_cast(Val);
+  }
+  static unsigned getHashValue(const RelaxedExpression *E) {
+    return E->getComputedHash();
+  }
+  static bool isEqual(const RelaxedExpression *LHS,
+                      const RelaxedExpression *RHS) {
+    if (LHS == RHS)
+      return true;
+    // Sentinels never equal real expressions; checking them first also keeps
+    // equals() from dereferencing a sentinel pointer.
+    if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
+        LHS == getEmptyKey() || RHS == getEmptyKey())
+      return false;
+    // Cheap hash compare first; full structural check only on collision.
+    if (LHS->getComputedHash() != RHS->getComputedHash())
+      return false;
+    return LHS->equals(*RHS);
+  }
+};
+
+// How an instruction is mapped.
+enum class IRInstructionMapType { Invalid, Ignored, Valid };
+
+// Checks to see if a provided instruction /p I can be mapped.
+// Classifies how instruction \p I participates in congruence mapping:
+//  - Invalid: acts as a sequence barrier (musttail / returns_twice /
+//    inalloca calls, unknown intrinsics, allocas, PHIs, terminators, EH).
+//  - Ignored: skipped entirely (debug and lifetime intrinsics).
+//  - Valid: eligible to receive a congruence id.
+//
+// NOTE(review): as elsewhere in this patch, template argument lists have
+// been stripped from casts and declarations below (e.g. "isa(I)",
+// "function_ref GetBFI", "SpecificBumpPtrAllocator ExpressionAllocator",
+// "std::array, Instruction::OtherOpsEnd>", the "std::vector" return type);
+// restore them from the original revision before compiling.
+IRInstructionMapType getInstrMapType(Instruction *I) {
+  CallSite CS(I);
+  if (CS) {
+    // Be very conservative about musttail because it has additional
+    // guarantees that must be met.
+    if (CS.isMustTailCall())
+      return IRInstructionMapType::Invalid;
+    // Be conservative about return twice and inalloca calls.
+    if (CS.hasFnAttr(Attribute::ReturnsTwice) || CS.hasInAllocaArgument())
+      return IRInstructionMapType::Invalid;
+    switch (CS.getIntrinsicID()) {
+    // We ignore debug intrinsics.
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_value:
+    // As well as lifetime instructions.
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      return IRInstructionMapType::Ignored;
+    // Misc intrinsics that we handle.
+    case Intrinsic::objectsize:
+    case Intrinsic::expect:
+    case Intrinsic::prefetch:
+    case Intrinsic::trap:
+    // Invariants.
+    case Intrinsic::invariant_start:
+    // Overflow intrinsics.
+    case Intrinsic::sadd_with_overflow:
+    case Intrinsic::uadd_with_overflow:
+    case Intrinsic::ssub_with_overflow:
+    case Intrinsic::usub_with_overflow:
+    case Intrinsic::smul_with_overflow:
+    case Intrinsic::umul_with_overflow:
+    // Lib C functions are fine.
+    case Intrinsic::memcpy:
+    case Intrinsic::memmove:
+    case Intrinsic::memset:
+    case Intrinsic::sqrt:
+    case Intrinsic::pow:
+    case Intrinsic::powi:
+    case Intrinsic::sin:
+    case Intrinsic::cos:
+    case Intrinsic::exp:
+    case Intrinsic::exp2:
+    case Intrinsic::log:
+    case Intrinsic::log2:
+    case Intrinsic::log10:
+    case Intrinsic::fma:
+    case Intrinsic::fabs:
+    case Intrinsic::minnum:
+    case Intrinsic::maxnum:
+    case Intrinsic::copysign:
+    case Intrinsic::floor:
+    case Intrinsic::ceil:
+    case Intrinsic::trunc:
+    case Intrinsic::rint:
+    case Intrinsic::nearbyint:
+    case Intrinsic::round:
+    // Bit manipulation intrinsics.
+    case Intrinsic::bitreverse:
+    case Intrinsic::bswap:
+    case Intrinsic::ctpop:
+    case Intrinsic::ctlz:
+    case Intrinsic::cttz:
+    // Non intrinsics are fine.
+    case Intrinsic::not_intrinsic:
+      return IRInstructionMapType::Valid;
+    default:
+      return IRInstructionMapType::Invalid;
+    }
+    // NOTE(review): unreachable — every path through the switch above
+    // already returns; this statement can be deleted.
+    return IRInstructionMapType::Valid;
+  }
+
+  // We don't take allocas from the parent function.
+  // PHINodes don't cost anything and will simply become inputs.
+  // Terminators will exist in both so there is no benefit from outlining
+  // them.
+  if (isa(I) || isa(I) || I->isTerminator())
+    return IRInstructionMapType::Invalid;
+  // Don't take any exception handling instructions.
+  if (I->isEHPad())
+    return IRInstructionMapType::Invalid;
+  return IRInstructionMapType::Valid;
+}
+
+// Maps every outlinable instruction in \p M to a congruence-class id and
+// returns the id vector ("alphabet") consumed by the candidate search.
+// Valid ids grow upward from 1; barrier ("illegal") ids grow downward from
+// UINT_MAX so each barrier is unique and never matches anything. OM records
+// the instruction <-> index correspondence in parallel.
+// NOTE(review): AllowODR and DumpCC are presumably cl::opt flags declared
+// earlier in this file (not visible in this hunk) — verify.
+std::vector
+llvm::mapIRModule(OutlinerMapper &OM, Module &M, ProfileSummaryInfo *PSI,
+                  function_ref GetBFI) {
+  bool HasProfileData = PSI->hasProfileSummary();
+  std::vector CCVec;
+  // Valid congruence ids count up from 1; illegal (barrier) ids count down
+  // from UINT_MAX so the two ranges cannot collide.
+  unsigned CCID = 1, IllegalID = UINT_MAX;
+
+  /// Memory management for the GVN expressions used for congruency.
+  SpecificBumpPtrAllocator ExpressionAllocator;
+
+  /// Mapping of expression to congruency id.
+  // Bucketed by opcode so only same-opcode expressions are ever compared.
+  using GlobalCongruencyMapTy =
+      std::array, Instruction::OtherOpsEnd>;
+  GlobalCongruencyMapTy GlobalCC;
+
+  // Insert illegal ID at the front to act as a sentinel.
+  OM.mapInstr(nullptr);
+  CCVec.push_back(IllegalID--);
+
+  bool LastWasInvalid = true;
+  for (Function &F : M) {
+    // Don't allow non exact definitions.
+    if (!F.hasExactDefinition())
+      if (!AllowODR || (!F.hasLinkOnceODRLinkage() && !F.hasWeakODRLinkage()))
+        continue;
+    // Be conservative about unknown GC types.
+    if (F.hasGC())
+      continue;
+    // Don't consider dead or optnone.
+    if (F.hasFnAttribute(Attribute::OptimizeNone) || F.isDefTriviallyDead())
+      continue;
+
+    // Don't outline non -Oz functions without profile data.
+    bool FavorSize = F.optForMinSize();
+    if (!FavorSize && !HasProfileData)
+      continue;
+    BlockFrequencyInfo *BFI = HasProfileData ? &GetBFI(F) : nullptr;
+    for (BasicBlock &BB : F) {
+      // Skip hot blocks if we have profile data.
+      // With -Oz, skip only provably hot blocks; otherwise (profile-driven
+      // mode) require the block to be provably cold.
+      if (HasProfileData)
+        if (FavorSize ? PSI->isHotBB(&BB, BFI) : !PSI->isColdBB(&BB, BFI))
+          continue;
+
+      // Helper for adding an invalid instruction.
+      // Each barrier consumes a fresh id so no two barriers ever match.
+      auto EmitInvalidInst = [&]() {
+        LastWasInvalid = true;
+        OM.mapInstr(nullptr);
+        CCVec.push_back(IllegalID--);
+      };
+
+      // Try to map each instruction to a congruency id.
+      for (Instruction &I : BB) {
+        switch (getInstrMapType(&I)) {
+        // Skip invalid/ignored instructions.
+        // Consecutive invalid instructions collapse into one barrier.
+        case IRInstructionMapType::Invalid:
+          if (!LastWasInvalid)
+            EmitInvalidInst();
+          LLVM_FALLTHROUGH;
+        case IRInstructionMapType::Ignored:
+          continue;
+        case IRInstructionMapType::Valid:
+          break;
+        }
+        OM.mapInstr(&I);
+
+        // We map each valid instruction to a Relaxed expression and use
+        // this for detecting congruency.
+        RelaxedExpression *E =
+            new (ExpressionAllocator.Allocate()) RelaxedExpression(I);
+        // try_emplace only assigns a new id if no congruent expression was
+        // seen before for this opcode.
+        auto ItPair = GlobalCC[I.getOpcode()].try_emplace(E, CCID);
+        if (ItPair.second)
+          ++CCID;
+        // Use the mapped congruency ID.
+        CCVec.push_back(ItPair.first->second);
+        LastWasInvalid = false;
+
+        // We need a barrier after a valid terminator.
+        if (isa(&I))
+          EmitInvalidInst();
+      }
+    }
+  }
+
+  // Dump the mapped congruencies found for the module /p M.
+  if (DumpCC) {
+#ifndef NDEBUG
+    for (Function &F : M) {
+      dbgs() << "function : " << F.getName() << "\n";
+      for (auto &BB : F) {
+        dbgs() << "block : " << BB.getName() << "\n";
+        for (Instruction &I : BB) {
+          unsigned Idx = OM.getInstrIdx(&I);
+          // NOTE(review): this local CCID shadows the outer counter above —
+          // harmless here, but consider renaming.
+          unsigned CCID = Idx == unsigned(-1) ? -1 : CCVec[Idx];
+          dbgs() << "-- " << CCID << " : " << I << '\n';
+        }
+      }
+    }
+    for (auto &CCIt : GlobalCC) {
+      for (auto CC : CCIt) {
+        size_t Idx = CC.second;
+        dbgs() << "-- Examining CC ID : " << Idx << "\n";
+        for (size_t i = 0, e = CCVec.size(); i < e; ++i)
+          if (CCVec[i] == Idx)
+            dbgs() << " - " << *OM.getInstr(i) << "\n";
+        dbgs() << "\n";
+      }
+    }
+#endif
+  }
+
+  // We regroup the illegal indices so that our alphabet is of a defined size.
+  // Barriers (ids > IllegalID) are remapped to 1..Diff; valid ids are shifted
+  // up by Diff, yielding one contiguous, dense alphabet.
+  unsigned Diff = (UINT_MAX - IllegalID);
+  for (unsigned &InstId : CCVec)
+    if (InstId > IllegalID)
+      InstId = 1 + (UINT_MAX - InstId);
+    else
+      InstId += Diff;
+  return CCVec;
+}