Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -116,6 +116,7 @@ void initializeExpandPostRAPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry &); void initializeGCOVProfilerPass(PassRegistry&); +void initializePGOLateInstrumentationPass(PassRegistry&); void initializeInstrProfilingPass(PassRegistry&); void initializeAddressSanitizerPass(PassRegistry&); void initializeAddressSanitizerModulePass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -85,6 +85,7 @@ (void) llvm::createDomOnlyViewerPass(); (void) llvm::createDomViewerPass(); (void) llvm::createGCOVProfilerPass(); + (void) llvm::createPGOLateInstrumentationPass(); (void) llvm::createInstrProfilingPass(); (void) llvm::createFunctionInliningPass(); (void) llvm::createAlwaysInlinerPass(); Index: include/llvm/ProfileData/InstrProfWriter.h =================================================================== --- include/llvm/ProfileData/InstrProfWriter.h +++ include/llvm/ProfileData/InstrProfWriter.h @@ -41,7 +41,8 @@ /// summed. std::error_code addFunctionCounts(StringRef FunctionName, uint64_t FunctionHash, - ArrayRef Counters); + ArrayRef Counters, + bool IsLateInstr = false); /// Write the profile to \c OS void write(raw_fd_ostream &OS); /// Write the profile, returning the raw data. For testing. Index: include/llvm/Transforms/IPO/PassManagerBuilder.h =================================================================== --- include/llvm/Transforms/IPO/PassManagerBuilder.h +++ include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -147,6 +147,7 @@ void addInitialAliasAnalysisPasses(legacy::PassManagerBase &PM) const; void addLTOOptimizationPasses(legacy::PassManagerBase &PM); void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM); + void addPGOLateInstrPasses(legacy::PassManagerBase &MPM); public: /// populateFunctionPassManager - This fills in the function pass manager, Index: include/llvm/Transforms/Instrumentation.h =================================================================== --- include/llvm/Transforms/Instrumentation.h +++ include/llvm/Transforms/Instrumentation.h @@ -77,6 +77,10 @@ ModulePass *createGCOVProfilerPass(const GCOVOptions &Options = GCOVOptions::getDefault()); +// PGO Late Instrumention +ModulePass *createPGOLateInstrumentationPass(StringRef Filename = + StringRef("")); + /// Options for the frontend instrumentation based profiling pass. struct InstrProfOptions { InstrProfOptions() : NoRedZone(false) {} Index: lib/ProfileData/InstrProfWriter.cpp =================================================================== --- lib/ProfileData/InstrProfWriter.cpp +++ lib/ProfileData/InstrProfWriter.cpp @@ -74,7 +74,8 @@ std::error_code InstrProfWriter::addFunctionCounts(StringRef FunctionName, uint64_t FunctionHash, - ArrayRef Counters) { + ArrayRef Counters, + bool IsLateInstr) { auto &CounterData = FunctionData[FunctionName]; auto Where = CounterData.find(FunctionHash); @@ -82,8 +83,15 @@ // We've never seen a function with this name and hash, add it. CounterData[FunctionHash] = Counters; // We keep track of the max function count as we go for simplicity. - if (Counters[0] > MaxFunctionCount) - MaxFunctionCount = Counters[0]; + if (!IsLateInstr) { + if (Counters[0] > MaxFunctionCount) + MaxFunctionCount = Counters[0]; + } else { + for (size_t I = 0, E = Counters.size(); I < E; ++I) { + if (Counters[I] > MaxFunctionCount) + MaxFunctionCount = Counters[I]; + } + } return instrprof_error::success; } @@ -98,9 +106,11 @@ if (FoundCounters[I] + Counters[I] < FoundCounters[I]) return instrprof_error::counter_overflow; FoundCounters[I] += Counters[I]; + if (IsLateInstr && FoundCounters[I] > MaxFunctionCount) + MaxFunctionCount = FoundCounters[I]; } // We keep track of the max function count as we go for simplicity. - if (FoundCounters[0] > MaxFunctionCount) + if (!IsLateInstr && FoundCounters[0] > MaxFunctionCount) MaxFunctionCount = FoundCounters[0]; return instrprof_error::success; Index: lib/Transforms/IPO/LLVMBuild.txt =================================================================== --- lib/Transforms/IPO/LLVMBuild.txt +++ lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ name = IPO parent = Transforms library_name = ipo -required_libraries = Analysis Core InstCombine ProfileData Scalar Support TransformUtils Vectorize +required_libraries = Analysis Core InstCombine ProfileData Scalar Support TransformUtils Vectorize Instrumentation Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -32,6 +32,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Vectorize.h" +#include "llvm/Transforms/Instrumentation.h" using namespace llvm; @@ -99,6 +100,17 @@ cl::desc( "Enable the GlobalsModRef AliasAnalysis outside of the LTO pipeline.")); +static cl::opt +RunPGOLateInstrGen("pgo-late-instr-generate", cl::init(""), cl::Hidden, + cl::desc("Enable generate phrase of PGO Late instrumentation " + "and specify the path of profile data file")); + +static cl::opt +RunPGOLateInstrUse("pgo-late-instr-use", cl::init(""), cl::Hidden, + cl::value_desc("filename"), + cl::desc("Enable use phrase of PGO Late instrumentation " + " and specify the path of profile data file")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -179,11 +191,25 @@ FPM.add(createLowerExpectIntrinsicPass()); } +// Do PGO late instrumentation generate or use operation as optiton +// specified. +void PassManagerBuilder::addPGOLateInstrPasses(legacy::PassManagerBase &MPM) { + if (!RunPGOLateInstrGen.empty()) { + MPM.add(createPGOLateInstrumentationPass()); + InstrProfOptions Options; + Options.InstrProfileOutput = RunPGOLateInstrGen; + MPM.add(createInstrProfilingPass(Options)); + } + else if (!RunPGOLateInstrUse.empty()) + MPM.add(createPGOLateInstrumentationPass(RunPGOLateInstrUse)); +} + void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { // If all optimizations are disabled, just run the always-inline pass and, // if enabled, the function merging pass. if (OptLevel == 0) { + addPGOLateInstrPasses(MPM); if (Inliner) { MPM.add(Inliner); Inliner = nullptr; @@ -222,6 +248,8 @@ MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } + addPGOLateInstrPasses(MPM); + if (EnableNonLTOGlobalsModRef) // We add a module alias analysis pass here. In part due to bugs in the // analysis infrastructure this "works" in that the analysis stays alive Index: lib/Transforms/Instrumentation/CMakeLists.txt =================================================================== --- lib/Transforms/Instrumentation/CMakeLists.txt +++ lib/Transforms/Instrumentation/CMakeLists.txt @@ -6,6 +6,7 @@ MemorySanitizer.cpp Instrumentation.cpp InstrProfiling.cpp + PGOLateInstr.cpp SafeStack.cpp SanitizerCoverage.cpp ThreadSanitizer.cpp Index: lib/Transforms/Instrumentation/Instrumentation.cpp =================================================================== --- lib/Transforms/Instrumentation/Instrumentation.cpp +++ lib/Transforms/Instrumentation/Instrumentation.cpp @@ -60,6 +60,7 @@ initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingPass(Registry); initializeGCOVProfilerPass(Registry); + initializePGOLateInstrumentationPass(Registry); initializeInstrProfilingPass(Registry); initializeMemorySanitizerPass(Registry); initializeThreadSanitizerPass(Registry); Index: lib/Transforms/Instrumentation/LLVMBuild.txt =================================================================== --- lib/Transforms/Instrumentation/LLVMBuild.txt +++ lib/Transforms/Instrumentation/LLVMBuild.txt @@ -19,4 +19,4 @@ type = Library name = Instrumentation parent = Transforms -required_libraries = Analysis Core MC Support TransformUtils +required_libraries = Analysis Core MC Support TransformUtils ProfileData Index: lib/Transforms/Instrumentation/PGOLateInstr.cpp =================================================================== --- /dev/null +++ lib/Transforms/Instrumentation/PGOLateInstr.cpp @@ -0,0 +1,868 @@ +//===- PGOLateInstru.cpp - Late Instrumentation for PGO --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements late instrumentation for PGO. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/Pass.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/ProfileData/InstrProfReader.h" +#include "llvm/Analysis/CFG.h" +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "pgo-late-instr" + +STATISTIC(NumOfLatePGOInstrument, "Number of edges instrumented."); +STATISTIC(NumOfLatePGOEdge, "Number of edges."); +STATISTIC(NumOfLatePGOBB, "Number of basic-blocks."); +STATISTIC(NumOfLatePGOSplit, "Number of critical edge splits."); + +namespace { +class PGOLateInstrumentation : public ModulePass { +public: + static char ID; + + // For profile use, provide the profile filename as the parameter. + // If the parameter is an empty string, it is for the profile generate + // compilation. + PGOLateInstrumentation(StringRef Filename = StringRef("")) + : ModulePass(ID), ProfileFileName(Filename) { + initializePGOLateInstrumentationPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "PGOLateInstrmentationPass"; + } + +private: + StringRef ProfileFileName; + bool runOnModule(Module &M) override; +}; +} // end anonymous namespace + +char PGOLateInstrumentation::ID = 0; +INITIALIZE_PASS(PGOLateInstrumentation, "pgo-late-instr", + "Late (IR level) instrumentation for PGO.", false, false) + +ModulePass *llvm::createPGOLateInstrumentationPass(StringRef Filename) { + return new PGOLateInstrumentation(Filename); +} + +namespace llvm { + +/// \brief An MST based instrumentation for PGO +/// +/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO +/// in the function level. +class PGOLateInstrumentationFunc { +private: + Function &F; + Module *M; + std::string FuncName; + GlobalVariable *FuncNameVar; + + // If this for profile-use compilation. + bool IsUse; + + // CFG hash value for this function. + uint64_t FunctionHash; + + // The maximum count value in the profile. This is only used in PGO use + // compilation. + uint64_t ProgramMaxCount; + + // Threshold of the hot functions. + const float HotFunctionThreshold = 0.01; + + // Threshold of the cold functions. + const float ColdFunctionThreshold = 0.0002; + + // This class implements the CFG edges. Note the CFG can be a multi-graph. + struct Edge { + const BasicBlock *SrcBB; + const BasicBlock *DestBB; + unsigned Weight; + bool InMST; + bool Removed; + bool IsCritical; + bool CountValid; + uint64_t CountValue; + Edge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 1) + : SrcBB(Src), DestBB(Dest), Weight(W), InMST(false), Removed(false), + IsCritical(false), CountValid(false), CountValue(0) {} + }; + // Store all the edges in CFG. It may contains some stale edges + // (when Removed == true). + std::vector AllEdges; + + // A structure that uses to store the direct edges from BB. + // Make is a small vecdtor of 2 as most of BBs have 1 or 2 out in/out edges. + typedef SmallVector DirectEdges; + + struct CompareEdgeWeight { + bool operator()(const Edge *lhs, const Edge *rhs) const { + return lhs->Weight > rhs->Weight; + } + }; + + // This class stores the auxiliary information for each BB. + struct BBInfo { + // Below three members are for union-find algorithm. + BBInfo *Group; + unsigned Index; + unsigned Rank; + // This following memmbers are primarily for read and populate profile + // count in PGO use. It stores the profile count for each BB. + uint64_t CountValue; + bool CountValid; + uint32_t UnknownCountInEdge; + uint32_t UnknownCountOutEdge; + DirectEdges InEdges; + DirectEdges OutEdges; + BBInfo(unsigned IX) + : Group(this), Index(IX), Rank(0), CountValue(0), CountValid(false), + UnknownCountInEdge(0), UnknownCountOutEdge(0) {} + BBInfo(unsigned IX, uint64_t C) + : Group(this), Index(IX), Rank(0), CountValue(C), CountValid(true), + UnknownCountInEdge(0), UnknownCountOutEdge(0) {} + }; + DenseMap BBInfos; + + BBInfo *findAndCompressGroup(BBInfo *BB); + bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2); + void setFuncName(StringRef Name, GlobalValue::LinkageTypes Linkage); + void setFuncName(Function &Fn) { setFuncName(Fn.getName(), Fn.getLinkage()); } + void createFuncNameVar(GlobalValue::LinkageTypes Linkage); + uint64_t sumEdgeCount(const DirectEdges &Edges); + void setUnknownCountEdge(DirectEdges &Edges, uint64_t Value); + unsigned crc32UnsignedBits(unsigned Chksum, unsigned Value, unsigned Bits); + + BBInfo *getBBInfo(const BasicBlock *BB) const { + auto It = BBInfos.find(BB); + assert(It != BBInfos.end()); + return It->second; + } + + BBInfo *setBBCount(const BasicBlock *BB, uint64_t Value) { + auto It = BBInfos.find(BB); + assert(It != BBInfos.end()); + It->second->CountValue = Value; + It->second->CountValid = true; + return It->second; + } + + // Create CFG edges and the associated data structures. + void buildEdges(); + + // Sort the edges in AllEdges based on their weight. + void sortEdges(); + + // Compute the minimum spanning tree. + void computeMinimumSpanningTree(); + + // Calculate the CFG hash for this function. + void computeCFGHash(); + + // Print out edges for debug. + void dumpEdges(bool DumpCount = false) const; + + // Give an edge, find the BB that will be instrumented. + // Return NULL if no BB needs to be instrumented. + BasicBlock *getInstrBB(Edge *E); + + // Add the instrumentation intrinsic calls. + void instrumentCFG(); + + // Find the Instrumented BB and set the value. + void setInstrumentedCounts(const std::vector *CountFromProfile); + + // Read counts for the instrumented BB from profile. + bool readCounters(StringRef ProfileFileName); + + // Populate the counts for all BBs. + void populateCounters(); + + // Set the branch weights based on the count values. + void setBranchWeights(); + + // Add edge to AllEdges with with Weight. + Edge *addEdge(const BasicBlock *Src, const BasicBlock *Dest, unsigned W = 2) { + if (BBInfos.find(Src) == BBInfos.end()) { + unsigned Index = BBInfos.size(); + BBInfos.insert(std::make_pair(Src, new BBInfo(Index))); + } + if (BBInfos.find(Dest) == BBInfos.end()) { + unsigned Index = BBInfos.size(); + BBInfos.insert(std::make_pair(Dest, new BBInfo(Index))); + } + Edge *E = new Edge(Src, Dest, W); + AllEdges.push_back(E); + return E; + } + + // Set edge count value + void setEdgeCount(Edge *E, uint64_t Value) { + E->CountValue = Value; + E->CountValid = true; + } + + // Set the hot/cold inline hints based on the count values. + void applyFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { + if (ProgramMaxCount == 0) + return; + if (EntryCount >= + static_cast(HotFunctionThreshold * + static_cast(ProgramMaxCount))) + F.addFnAttr(llvm::Attribute::InlineHint); + else if (MaxCount <= + static_cast(ColdFunctionThreshold * + static_cast(ProgramMaxCount))) + F.addFnAttr(llvm::Attribute::Cold); + } + +public: + PGOLateInstrumentationFunc(Function &Func, Module *Modu, StringRef Filename) + : F(Func), M(Modu), FunctionHash(0), ProgramMaxCount(0) { + IsUse = !Filename.empty(); + setFuncName(F); + buildEdges(); + sortEdges(); + computeMinimumSpanningTree(); + computeCFGHash(); + dumpEdges(); + + if (!IsUse) { + instrumentCFG(); + return; + } + + if (readCounters(Filename)) { + populateCounters(); + setBranchWeights(); + dumpEdges(true); + } + }; + + ~PGOLateInstrumentationFunc(); +}; + +PGOLateInstrumentationFunc::~PGOLateInstrumentationFunc() { + // Free Edge in AllEdges. + for (auto &EI : AllEdges) + delete EI; + + // Free BBInfo in BBInfos. + for (auto &BBGI : BBInfos) + delete BBGI.second; +} + +// Compute the CRC32 of unsigned integers. +unsigned PGOLateInstrumentationFunc::crc32UnsignedBits(unsigned Chksum, + unsigned Value, + unsigned Bits) { + for (unsigned i = Bits; i--; Value <<= 1) { + unsigned Feedback = (Value ^ Chksum) & 0x80000000 ? 0x04c11db7 : 0; + Chksum <<= 1; + Chksum ^= Feedback; + } + return Chksum; +} + +// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index +// value of each BB in the CFG. The higher 32 bits record the number of edges. +void PGOLateInstrumentationFunc::computeCFGHash() { + uint32_t Chksum = 0; + for (auto &BB : F) { + const TerminatorInst *TI = BB.getTerminator(); + for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) { + BasicBlock *Succ = TI->getSuccessor(s); + unsigned Index = getBBInfo(Succ)->Index; + Chksum = crc32UnsignedBits(Chksum, Index, 32); + } + } + FunctionHash = AllEdges.size() << 32 | Chksum; +} + +// Traverse the CFG using a stack. Find all the edges and assign the weight. +// Edges with large weight will be put into MST first so they are less likely +// to be instrumented. +// Here we use a very simple heuristic to assign the edge weight. +// The best weight is the profile count itself. The next best way is +// probably to use the static profile. +void PGOLateInstrumentationFunc::buildEdges() { + DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n"); + + const BasicBlock *BB = &(F.getEntryBlock()); + // Add a fake edge to the entry. + addEdge(0, BB); + + // Special handling for single BB functions. + if (succ_empty(BB)) { + addEdge(BB, 0); + NumOfLatePGOBB += 1; + NumOfLatePGOEdge += 2; + return; + } + + SmallPtrSet Visited; + SmallVector, 8> VisitStack; + SmallPtrSet InStack; + + Visited.insert(BB); + VisitStack.push_back(std::make_pair(BB, succ_begin(BB))); + InStack.insert(BB); + + do { + std::pair &Top = VisitStack.back(); + const BasicBlock *ParentBB = Top.first; + succ_const_iterator &I = Top.second; + + bool FoundNew = false; + while (I != succ_end(ParentBB)) { + BB = *I++; + unsigned Weight = 2; + bool Critical = false; + const TerminatorInst *TI = ParentBB->getTerminator(); + unsigned SuccNum = GetSuccessorNumber(const_cast(ParentBB), + const_cast(BB)); + if (SuccNum >= 2) + Weight = 3; + if (SuccNum == 0) + Weight = 0; + if (isCriticalEdge(TI, SuccNum)) { + Weight = 4; + Critical = true; + } + + if (Visited.insert(BB).second) { + FoundNew = true; + } + + // Successor is in VisitStack, it's a back edge. + if (!FoundNew && InStack.count(BB)) + Weight = 6; + + Edge *Ei = addEdge(ParentBB, BB, Weight); + Ei->IsCritical = Critical; + DEBUG(dbgs() << " Edge: from " << ParentBB->getName() << " to " + << BB->getName() << " w = " << Weight << "\n"); + if (FoundNew) + break; + } + + if (FoundNew) { + // Go down one level if there is a unvisited successor. + InStack.insert(BB); + VisitStack.push_back(std::make_pair(BB, succ_begin(BB))); + // If no successors, add a fake edge. + if (BB->getTerminator()->getNumSuccessors() == 0) + addEdge(BB, 0); + } else { + // Go up one level. + InStack.erase(VisitStack.pop_back_val().first); + } + } while (!VisitStack.empty()); + NumOfLatePGOBB += BBInfos.size(); + NumOfLatePGOEdge += AllEdges.size(); +} + +// Union find algorithm implementation for minimum spanning tree. + +// find the root group of the G and compress the path from G to the root. +PGOLateInstrumentationFunc::BBInfo * +PGOLateInstrumentationFunc::findAndCompressGroup(BBInfo *G) { + if (G->Group != G) + G->Group = findAndCompressGroup(G->Group); + return G->Group; +} + +// Union BB1 and BB2 into the same group and return true. +// Returns false if BB1 and BB2 are already in the same group. +bool PGOLateInstrumentationFunc::unionGroups(const BasicBlock *BB1, + const BasicBlock *BB2) { + BBInfo *BB1G = findAndCompressGroup(getBBInfo(BB1)); + BBInfo *BB2G = findAndCompressGroup(getBBInfo(BB2)); + + if (BB1G == BB2G) + return false; + + // Make the smaller rank tree a direct child or the root of high rank tree. + if (BB1G->Rank < BB2G->Rank) + BB1G->Group = BB2G; + else { + BB2G->Group = BB1G; + // If the ranks are the same, increment root of one tree by one. + if (BB1G->Rank == BB2G->Rank) + BB1G->Rank++; + } + return true; +} + +// Traverse all the edges and compute the Minimum Weight Spanning Tree +// using union-find algorithm. +void PGOLateInstrumentationFunc::computeMinimumSpanningTree() { + // First, put all the critical edge with landing-pad as the Dest to MST. + // This works around the insufficient support of critical edges split + // when destination BB is a landing pad. + // FE-based instrumentation in theory needs the same support. But it just + // ignores this issue and instrumenting on the source BB. The coverage result + // of FE-based instrumentation is incorrect for these cases. + for (auto &Ei : AllEdges) { + if (Ei->Removed) + continue; + if (Ei->IsCritical) { + if (Ei->DestBB && Ei->DestBB->isLandingPad()) { + if (unionGroups(Ei->SrcBB, Ei->DestBB)) + Ei->InMST = true; + } + } + } + + for (auto &Ei : AllEdges) { + if (Ei->Removed) + continue; + if (unionGroups(Ei->SrcBB, Ei->DestBB)) + Ei->InMST = true; + } +} + +// Dump the Debug information about the instrumentation. +void PGOLateInstrumentationFunc::dumpEdges(bool DumpCount) const { + DEBUG(dbgs() << "Dumping Function: " << FuncName + << "\t Hash: " << FunctionHash << "\n"); + DEBUG(dbgs() << " Number of Basic Blocks: " << BBInfos.size() << "\n" + << " Index 0 : FakeNode\n"); + for (auto &BB : F) { + DEBUG(dbgs() << " Index " << getBBInfo(&BB)->Index << " : " + << BB.getName()); + if (DumpCount && getBBInfo(&BB)->CountValid) + DEBUG(dbgs() << " : " << getBBInfo(&BB)->CountValue); + DEBUG(dbgs() << "\n"); + } + uint64_t WeightSum = 0; + DEBUG(dbgs() << " Edges: " << AllEdges.size() + << " (*: Instrument, F: FakeEdge, -: removed)\n"); + for (unsigned i = 0, E = AllEdges.size(); i < E; i++) { + Edge *Ei = AllEdges[i]; + if (Ei->Removed) + DEBUG(dbgs() << " -"); + else if (!Ei->InMST) { + DEBUG(dbgs() << " *"); + WeightSum += Ei->Weight; + } else + DEBUG(dbgs() << " "); + DEBUG(dbgs() << " Edge " << i << ": " << getBBInfo(Ei->SrcBB)->Index + << "-->" << getBBInfo(Ei->DestBB)->Index); + DEBUG(dbgs() << " w = " << Ei->Weight); + if (DumpCount && !Ei->Removed) { + if (Ei->CountValid) + DEBUG(dbgs() << " : " << Ei->CountValue); + } + DEBUG(dbgs() << "\n"); + } +} + +// Sort CFG edges based on its weight. +void PGOLateInstrumentationFunc::sortEdges() { + std::sort(AllEdges.begin(), AllEdges.end(), CompareEdgeWeight()); +} + +// set the function name and prefix module name for local linkage to avoid +// the potential name conflict. +void PGOLateInstrumentationFunc::setFuncName( + StringRef Name, GlobalValue::LinkageTypes Linkage) { + StringRef RawFuncName = Name; + + // Function names may be prefixed with a binary '1' to indicate + // that the back-end should not modify the symbols due to any platform + // naming convention. Do not include that '1' in the PGO profile name. + if (RawFuncName[0] == '\1') + RawFuncName = RawFuncName.substr(1); + + FuncName = RawFuncName; + + if (GlobalValue::isLocalLinkage(Linkage)) { + if (M->getName().empty()) + FuncName = FuncName.insert(0, ":"); + else + FuncName = FuncName.insert(0, M->getName().str() + ":"); + } + + if (!IsUse) + createFuncNameVar(Linkage); +} + +void PGOLateInstrumentationFunc::createFuncNameVar( + GlobalValue::LinkageTypes Linkage) { + // Usually, we want to match the function's linkage, but + // available_externally and extern_weak both have the wrong semantics. + if (Linkage == GlobalValue::ExternalWeakLinkage) + Linkage = GlobalValue::LinkOnceAnyLinkage; + else if (Linkage == GlobalValue::AvailableExternallyLinkage) + Linkage = GlobalValue::LinkOnceODRLinkage; + else if (Linkage == GlobalValue::InternalLinkage || + Linkage == GlobalValue::ExternalLinkage) + Linkage = GlobalValue::PrivateLinkage; + + if (F.hasComdat()) + Linkage = GlobalValue::WeakAnyLinkage; + + auto *Value = ConstantDataArray::getString(M->getContext(), FuncName, false); + FuncNameVar = new GlobalVariable(*M, Value->getType(), true, Linkage, Value, + "__llvm_profile_name_" + FuncName); + + // Hide the symbol so that we correctly get a copy for each executable. + if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); +} + +BasicBlock *PGOLateInstrumentationFunc::getInstrBB(Edge *E) { + if (E->InMST || E->Removed) + return NULL; + + BasicBlock *SrcBB = const_cast(E->SrcBB); + BasicBlock *DestBB = const_cast(E->DestBB); + // For a fake edge, instrument the real BB. + if (SrcBB == NULL) + return DestBB; + if (DestBB == NULL) + return SrcBB; + + // Instrument the SrcBB if it has a single successor, + // otherwise, the DestBB if this is not a critical edge. + TerminatorInst *TI = SrcBB->getTerminator(); + if (TI->getNumSuccessors() <= 1) + return SrcBB; + if (!E->IsCritical) + return DestBB; + + // For a critical edge, we have to split. Instrument the newly + // created BB. + NumOfLatePGOSplit++; + DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB)->Index << " --> " + << getBBInfo(DestBB)->Index << "\n"); + unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB); + BasicBlock *InstrBB = SplitCriticalEdge(TI, SuccNum); + assert(InstrBB && "Critical edge is not split"); + + E->Removed = true; + return InstrBB; +} + +// Visit all edge and instrument the edges not in MST. +// Critical edges will be split. +void PGOLateInstrumentationFunc::instrumentCFG() { + unsigned NumCounters = 0; + for (auto &Ei : AllEdges) { + if (!Ei->InMST && !Ei->Removed) + NumCounters++; + } + NumOfLatePGOInstrument += NumCounters; + + for (unsigned i = 0, j = 0, E = AllEdges.size(); i < E; i++) { + Edge *Ei = AllEdges[i]; + BasicBlock *InstrBB = getInstrBB(Ei); + if (!InstrBB) + continue; + + IRBuilder<> Builder(InstrBB->getFirstInsertionPt()); + assert(Builder.GetInsertPoint() && "Cannot get the Instrumentation point"); + auto *I8PtrTy = Type::getInt8PtrTy(M->getContext()); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment), + {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), + Builder.getInt64(FunctionHash), Builder.getInt32(NumCounters), + Builder.getInt32(j++)}); + } +} + +// Visit all the edges and assign the count value for the instrumented +// edges and the BB. +void PGOLateInstrumentationFunc::setInstrumentedCounts( + const std::vector *CountFromProfile) { + for (unsigned i = 0, j = 0, E = AllEdges.size(); i < E; i++) { + Edge *Ei = AllEdges[i]; + BasicBlock *InstrBB = getInstrBB(Ei); + if (!InstrBB) { + continue; + } + uint64_t CountValue = (*CountFromProfile)[j++]; + + if (Ei->Removed) { + unsigned Index = BBInfos.size(); + BBInfos.insert(std::make_pair(InstrBB, new BBInfo(Index))); + BasicBlock *SrcBB = const_cast(Ei->SrcBB); + BasicBlock *DestBB = const_cast(Ei->DestBB); + + // Add new edge of SrcBB->InstrBB. + Edge *NewEdge = addEdge(SrcBB, InstrBB); + setEdgeCount(NewEdge, CountValue); + + // Add new edge of InstrBB->DestBB. + NewEdge = addEdge(InstrBB, DestBB); + setEdgeCount(NewEdge, CountValue); + NewEdge->InMST = true; + } else + setEdgeCount(Ei, CountValue); + + setBBCount(InstrBB, CountValue); + } +} + +// Sum up the count values for all the edges. +uint64_t PGOLateInstrumentationFunc::sumEdgeCount(const DirectEdges &Edges) { + uint64_t Total = 0; + for (auto &Ei : Edges) { + if (Ei->Removed) + continue; + Total += Ei->CountValue; + } + return Total; +} + +// Set the count value for the unknown edges. There should be one and only one +// unknown edge in Edges vector. +void PGOLateInstrumentationFunc::setUnknownCountEdge(DirectEdges &Edges, + uint64_t Value) { + for (auto &Ei : Edges) { + if (Ei->CountValid) + continue; + setEdgeCount(Ei, Value); + + getBBInfo(Ei->SrcBB)->UnknownCountOutEdge--; + getBBInfo(Ei->DestBB)->UnknownCountInEdge--; + return; + } + llvm_unreachable("Cannot find the unknown count edge"); +} + +// Read the profile from ProfileFileName and assign the value to th +// instrumented BB and the edges. This function also updates ProgramMaxCount. +// Return true if the profile are successfully read, and false on errors. +bool PGOLateInstrumentationFunc::readCounters(StringRef ProfileFileName) { + DEBUG(dbgs() << "Read in profile counters: "); + // Read the counter array from file. + std::unique_ptr PGOReader; + auto ReaderOrErr = IndexedInstrProfReader::create(ProfileFileName); + if (ReaderOrErr.getError()) { + errs() << FuncName << " Error in open profile " << ProfileFileName << "\n"; + return false; + } + + PGOReader = std::move(ReaderOrErr.get()); + if (!PGOReader) { + errs() << " Cannot get PGOReader: " << ProfileFileName << "\n"; + return false; + } + + std::vector CountFromProfile; + if (std::error_code EC = PGOReader->getFunctionCounts(FuncName, FunctionHash, + CountFromProfile)) { + if (EC == instrprof_error::unknown_function) + errs() << FuncName << " missing function \n"; + else if (EC == instrprof_error::hash_mismatch) + errs() << FuncName << " mismatch profile: " << FunctionHash << "\n"; + else if (EC == instrprof_error::malformed) + errs() << FuncName << " malformed profile \n"; + else + errs() << FuncName << " something wrong \n"; + return false; + } + + DEBUG(dbgs() << CountFromProfile.size() << " counts\n"); + uint64_t ValueSum = 0; + for (unsigned i = 0, E = CountFromProfile.size(); i < E; i++) { + DEBUG(dbgs() << " " << i << ": " << CountFromProfile[i] << "\n"); + ValueSum += CountFromProfile[i]; + } + + DEBUG(dbgs() << "SUM = " << ValueSum << "\n"); + + getBBInfo(reinterpret_cast(NULL))->UnknownCountOutEdge = 2; + getBBInfo(reinterpret_cast(NULL))->UnknownCountInEdge = 2; + + setInstrumentedCounts(&CountFromProfile); + + // ProgramMaxCount is computed in llvm-profdata merge operation. + // Late instrumentation handles it differently from FE-based instrumentation. + ProgramMaxCount = PGOReader->getMaximumFunctionCount(); + return true; +} + +// Populate the counters from instrumented BBs to all BBs. +// In the end of this operation, all BBs should have a valid count value. +void PGOLateInstrumentationFunc::populateCounters() { + // First set up Count variable for all BBs. + for (auto &Ei : AllEdges) { + if (Ei->Removed) + continue; + + BasicBlock *SrcBB = const_cast(Ei->SrcBB); + BasicBlock *DestBB = const_cast(Ei->DestBB); + BBInfo *SrcInfo = getBBInfo(SrcBB); + BBInfo *DestInfo = getBBInfo(DestBB); + SrcInfo->OutEdges.push_back(Ei); + DestInfo->InEdges.push_back(Ei); + SrcInfo->UnknownCountOutEdge++; + DestInfo->UnknownCountInEdge++; + + if (!Ei->CountValid) + continue; + DestInfo->UnknownCountInEdge--; + SrcInfo->UnknownCountOutEdge--; + } + + bool Changes = true; + unsigned NumPasses = 0; + + // For efficient traversal, it's better to start from the end as most + // of the instrumented edges are at the end. There is no existing reverse + // iterator. So we build a reverse list first. + std::vector BBInReverseOrder; + std::vector::iterator It = BBInReverseOrder.begin(); + for (auto &BB : F) + It = BBInReverseOrder.insert(It, &BB); + + while (Changes) { + NumPasses++; + Changes = false; + for (auto &BB : BBInReverseOrder) { + BBInfo *Count = getBBInfo(BB); + if (!Count->CountValid) { + if (Count->UnknownCountOutEdge == 0) { + Count->CountValue = sumEdgeCount(Count->OutEdges); + Count->CountValid = true; + Changes = true; + } else if (Count->UnknownCountInEdge == 0) { + Count->CountValue = sumEdgeCount(Count->InEdges); + Count->CountValid = true; + Changes = true; + } + } + if (Count->CountValid) { + if (Count->UnknownCountOutEdge == 1) { + uint64_t Total = Count->CountValue - sumEdgeCount(Count->OutEdges); + setUnknownCountEdge(Count->OutEdges, Total); + Changes = true; + } + if (Count->UnknownCountInEdge == 1) { + uint64_t Total = Count->CountValue - sumEdgeCount(Count->InEdges); + setUnknownCountEdge(Count->InEdges, Total); + Changes = true; + } + } + } + } + + DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes."); + // Assert every BB has a valid counter. + uint64_t FuncEntryCount = getBBInfo(F.begin())->CountValue; + uint64_t FuncMaxCount = FuncEntryCount; + for (auto &BB : F) { + assert(getBBInfo(&BB)->CountValid && "BB count is not valid"); + uint64_t Count = getBBInfo(&BB)->CountValue; + if (Count > FuncMaxCount) + FuncMaxCount = Count; + } + applyFunctionAttributes(FuncEntryCount, FuncMaxCount); +} + +/// \brief Calculate what to divide by to scale counts. +/// +/// Given the maximum count, calculate a divisor that will scale all the +/// weights to strictly less than UINT32_MAX. +static uint64_t calculateCountScale(uint64_t MaxCount) { + return MaxCount < UINT32_MAX ? 1 : MaxCount / UINT32_MAX + 1; +} + +/// \brief Scale an individual branch count (and add 1). +/// +/// Scale a 64-bit weight down to 32-bits using \c Scale. +/// +/// According to Laplace's Rule of Succession, it is better to compute the +/// count based on the count plus 1, so universally add 1 to the value. +/// +/// \pre \c Scale was calculated by \a calculateCountScale() with a count no +/// greater than \c Count. +static uint32_t scaleBranchCount(uint64_t Count, uint64_t Scale) { + uint64_t Scaled = Count / Scale + 1; + assert(Scaled <= UINT32_MAX && "overflow 32-bits"); + return Scaled; +} + +// Assign the scaled count values to the BB with multiple out edges. +void PGOLateInstrumentationFunc::setBranchWeights() { + // Generate MD_prof metadata for every branch instruction. + DEBUG(dbgs() << "\nSetting branch weights.\n"); + MDBuilder MDB(M->getContext()); + for (auto &BB : F) { + TerminatorInst *TI = BB.getTerminator(); + if (TI->getNumSuccessors() < 2) + continue; + if (!isa(TI) && !isa(TI)) + continue; + if (getBBInfo(&BB)->CountValue == 0) + continue; + + // We have a non-zero Branch BB. + auto BBCountIt = getBBInfo(&BB); + unsigned Size = BBCountIt->OutEdges.size(); + SmallVector EdgeCounts; + EdgeCounts.resize(Size); + for (unsigned s = 0; s < Size; s++) + EdgeCounts[s] = 0; + uint64_t MaxCount; + for (unsigned s = 0; s < Size; s++) { + const Edge *E = BBCountIt->OutEdges[s]; + const BasicBlock *SrcBB = E->SrcBB; + const BasicBlock *DestBB = E->DestBB; + if (DestBB == 0) + continue; + unsigned SuccNum = GetSuccessorNumber(const_cast(SrcBB), + const_cast(DestBB)); + uint64_t EdgeCount = E->CountValue; + if (EdgeCount > MaxCount) + MaxCount = EdgeCount; + EdgeCounts[SuccNum] = EdgeCount; + } + assert(MaxCount > 0 && "Bad max count"); + uint64_t Scale = calculateCountScale(MaxCount); + SmallVector Weights; + for (unsigned s = 0; s < Size; s++) + Weights.push_back(scaleBranchCount(EdgeCounts[s], Scale)); + + TI->setMetadata(llvm::LLVMContext::MD_prof, + MDB.createBranchWeights(Weights)); + } +} +} // end namespace llvm + +bool PGOLateInstrumentation::runOnModule(Module &M) { + for (auto &F : M) { + if (F.isDeclaration()) + continue; + PGOLateInstrumentationFunc Func(F, &M, ProfileFileName); + } + return false; +}