Index: include/llvm/Analysis/SyntheticCountsUtils.h =================================================================== --- /dev/null +++ include/llvm/Analysis/SyntheticCountsUtils.h @@ -0,0 +1,33 @@ +//===- SyntheticCountsUtils.h - utilities for count propagation--*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines utilities for synthetic counts propagation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_SYNTHETIC_COUNTS_UTILS_H +#define LLVM_ANALYSIS_SYNTHETIC_COUNTS_UTILS_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/CallSite.h" +#include "llvm/Support/ScaledNumber.h" + +namespace llvm { + +class CallGraph; +class Function; + +using Scaled64 = ScaledNumber; +void propagateSyntheticCounts( + const CallGraph &CG, function_ref GetCallSiteRelFreq, + function_ref GetCount, + function_ref AddToCount); +} // namespace llvm + +#endif Index: include/llvm/IR/Function.h =================================================================== --- include/llvm/IR/Function.h +++ include/llvm/IR/Function.h @@ -236,10 +236,11 @@ /// \brief Set the entry count for this function. /// /// Entry count is the number of times this function was executed based on - /// pgo data. \p Imports points to a set of GUIDs that needs to be imported - /// by the function for sample PGO, to enable the same inlines as the - /// profiled optimized binary. - void setEntryCount(uint64_t Count, + /// pgo data. \p Synthetic indicates the count is synthesized by analysis and + /// not from a profile run. \p Imports points to a set of GUIDs that needs to + /// be imported by the function for sample PGO, to enable the same inlines as + /// the profiled optimized binary. + void setEntryCount(uint64_t Count, bool Synthetic = false, const DenseSet *Imports = nullptr); /// \brief Get the entry count for this function. Index: include/llvm/IR/MDBuilder.h =================================================================== --- include/llvm/IR/MDBuilder.h +++ include/llvm/IR/MDBuilder.h @@ -66,10 +66,11 @@ /// Return metadata specifying that a branch or switch is unpredictable. MDNode *createUnpredictable(); - /// Return metadata containing the entry \p Count for a function, and the + /// Return metadata containing the entry \p Count for a function, a boolean + /// \Synthetic indicating whether the counts were synthetized, and the /// GUIDs stored in \p Imports that need to be imported for sample PGO, to /// enable the same inlines as the profiled optimized binary - MDNode *createFunctionEntryCount(uint64_t Count, + MDNode *createFunctionEntryCount(uint64_t Count, bool Synthetic, const DenseSet *Imports); /// Return metadata containing the section prefix for a function. Index: include/llvm/Transforms/IPO/SyntheticCountsPropagation.h =================================================================== --- /dev/null +++ include/llvm/Transforms/IPO/SyntheticCountsPropagation.h @@ -0,0 +1,19 @@ +#ifndef LLVM_TRANSFORMS_IPO_SYNTHETIC_COUNTS_PROPAGATION_H +#define LLVM_TRANSFORMS_IPO_SYNTHETIC_COUNTS_PROPAGATION_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/ScaledNumber.h" + +namespace llvm { +class Function; +class Module; + +class SyntheticCountsPropagation + : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); +}; +} // namespace llvm +#endif Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -74,6 +74,7 @@ ScalarEvolutionAliasAnalysis.cpp ScalarEvolutionExpander.cpp ScalarEvolutionNormalization.cpp + SyntheticCountsUtils.cpp TargetLibraryInfo.cpp TargetTransformInfo.cpp Trace.cpp Index: lib/Analysis/SyntheticCountsUtils.cpp =================================================================== --- /dev/null +++ lib/Analysis/SyntheticCountsUtils.cpp @@ -0,0 +1,122 @@ +//===--- SyntheticCountsUtils.cpp - synthetic counts propagation utils ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines utilities for propagating synthetic counts. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/SyntheticCountsUtils.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" + +using namespace llvm; + +// Given a set of functions in an SCC, propagate entry counts to functions +// called by the SCC. +static void +propagateFromSCC(const SmallPtrSetImpl &SCCFunctions, + function_ref GetCallSiteRelFreq, + function_ref GetCount, + function_ref AddToCount) { + + SmallVector CallSites; + + // Gather all callsites in the SCC. + auto GatherCallSites = [&]() { + for (auto *F : SCCFunctions) { + assert(F && !F->isDeclaration()); + for (auto &I : instructions(F)) { + if (auto CS = CallSite(&I)) { + CallSites.push_back(CS); + } + } + } + }; + + GatherCallSites(); + + // Partition callsites so that the callsites that call functions in the same + // SCC come first. + auto Mid = partition(CallSites, [&](CallSite &CS) { + auto *Callee = CS.getCalledFunction(); + if (Callee) + return SCCFunctions.count(Callee); + // FIXME: Use the !callees metadata to propagate counts through indirect + // calls. + return 0U; + }); + + // For functions in the same SCC, update the counts in two steps: + // 1. Compute the additional count for each function by propagating the counts + // along all incoming edges to the function that originate from the same SCC + // and summing them up. + // 2. Add the additional counts to the functions in the SCC. + // This ensures that the order of + // traversal of functions within the SCC doesn't change the final result. + + DenseMap AdditionalCounts; + for (auto It = CallSites.begin(); It != Mid; It++) { + auto &CS = *It; + auto RelFreq = GetCallSiteRelFreq(CS); + Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); + RelFreq *= Scaled64(GetCount(Caller), 0); + uint64_t AdditionalCount = RelFreq.toInt(); + AdditionalCounts[Callee] += AdditionalCount; + } + + // Update the counts for the functions in the SCC. + for (auto &Entry : AdditionalCounts) + AddToCount(Entry.first, Entry.second); + + // Now update the counts for functions not in SCC. + for (auto It = Mid; It != CallSites.end(); It++) { + auto &CS = *It; + auto Weight = GetCallSiteRelFreq(CS); + Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); + Weight *= Scaled64(GetCount(Caller), 0); + AddToCount(Callee, Weight.toInt()); + } +} + +/// Propgate synthetic entry counts on a callgraph. +/// +/// This performs a reverse post-order traversal of the callgraph SCC. For each +/// SCC, it first propagates the entry counts to the functions within the SCC +/// through call edges and updates them in one shot. Then the entry counts are +/// propagated to functions outside the SCC. +void llvm::propagateSyntheticCounts( + const CallGraph &CG, function_ref GetCallSiteRelFreq, + function_ref GetCount, + function_ref AddToCount) { + + SmallVector, 16> SCCs; + for (auto I = scc_begin(&CG); !I.isAtEnd(); ++I) { + auto SCC = *I; + + SmallPtrSet SCCFunctions; + for (auto *Node : SCC) { + Function *F = Node->getFunction(); + if (F && !F->isDeclaration()) { + SCCFunctions.insert(F); + } + } + SCCs.push_back(SCCFunctions); + } + + for (auto &SCCFunctions : reverse(SCCs)) + propagateFromSCC(SCCFunctions, GetCallSiteRelFreq, GetCount, AddToCount); +} Index: lib/IR/Function.cpp =================================================================== --- lib/IR/Function.cpp +++ lib/IR/Function.cpp @@ -1320,10 +1320,11 @@ setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit)); } -void Function::setEntryCount(uint64_t Count, +void Function::setEntryCount(uint64_t Count, bool Synthetic, const DenseSet *S) { MDBuilder MDB(getContext()); - setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count, S)); + setMetadata(LLVMContext::MD_prof, + MDB.createFunctionEntryCount(Count, Synthetic, S)); } Optional Function::getEntryCount() const { Index: lib/IR/MDBuilder.cpp =================================================================== --- lib/IR/MDBuilder.cpp +++ lib/IR/MDBuilder.cpp @@ -58,10 +58,14 @@ } MDNode *MDBuilder::createFunctionEntryCount( - uint64_t Count, const DenseSet *Imports) { + uint64_t Count, bool Synthetic, + const DenseSet *Imports) { Type *Int64Ty = Type::getInt64Ty(Context); SmallVector Ops; - Ops.push_back(createString("function_entry_count")); + if (Synthetic) + Ops.push_back(createString("synthetic_function_entry_count")); + else + Ops.push_back(createString("function_entry_count")); Ops.push_back(createConstant(ConstantInt::get(Int64Ty, Count))); if (Imports) { SmallVector OrderID(Imports->begin(), Imports->end()); Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -1692,8 +1692,11 @@ "expected string with name of the !prof annotation", MD); MDString *MDS = cast(MD->getOperand(0)); StringRef ProfName = MDS->getString(); - Assert(ProfName.equals("function_entry_count"), - "first operand should be 'function_entry_count'", MD); + Assert(ProfName.equals("function_entry_count") || + ProfName.equals("synthetic_function_entry_count"), + "first operand should be 'function_entry_count'" + " or 'synthetic_function_entry_count'", + MD); // Check second operand. Assert(MD->getOperand(1) != nullptr, "second operand should not be null", Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -80,6 +80,7 @@ #include "llvm/Transforms/IPO/PartialInlining.h" #include "llvm/Transforms/IPO/SCCP.h" #include "llvm/Transforms/IPO/StripDeadPrototypes.h" +#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" #include "llvm/Transforms/IPO/WholeProgramDevirt.h" #include "llvm/Transforms/InstCombine/InstCombine.h" #include "llvm/Transforms/InstrProfiling.h" @@ -176,6 +177,11 @@ "enable-npm-gvn-sink", cl::init(false), cl::Hidden, cl::desc("Enable the GVN hoisting pass for the new PM (default = off)")); +static cl::opt EnableSyntheticCounts( + "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Run synthetic function entry count generation " + "pass")); + static Regex DefaultAliasRegex( "^(default|thinlto-pre-link|thinlto|lto-pre-link|lto)<(O[0123sz])>$"); @@ -621,6 +627,10 @@ MPM.addPass(PGOIndirectCallPromotion(false, false)); } + // Synthesize function entry counts for non-PGO compilation. + if (EnableSyntheticCounts && !PGOOpt) + MPM.addPass(SyntheticCountsPropagation()); + // Require the GlobalsAA analysis for the module so we can query it within // the CGSCC pipeline. MPM.addPass(RequireAnalysisPass()); Index: lib/Passes/PassRegistry.def =================================================================== --- lib/Passes/PassRegistry.def +++ lib/Passes/PassRegistry.def @@ -73,6 +73,7 @@ MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass()) MODULE_PASS("sample-profile", SampleProfileLoaderPass()) MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass()) +MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation()) MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass()) MODULE_PASS("verify", VerifierPass()) #undef MODULE_PASS Index: lib/Transforms/IPO/CMakeLists.txt =================================================================== --- lib/Transforms/IPO/CMakeLists.txt +++ lib/Transforms/IPO/CMakeLists.txt @@ -29,6 +29,7 @@ SampleProfile.cpp StripDeadPrototypes.cpp StripSymbols.cpp + SyntheticCountsPropagation.cpp ThinLTOBitcodeWriter.cpp WholeProgramDevirt.cpp Index: lib/Transforms/IPO/SampleProfile.cpp =================================================================== --- lib/Transforms/IPO/SampleProfile.cpp +++ lib/Transforms/IPO/SampleProfile.cpp @@ -1466,7 +1466,7 @@ // Sets the GUIDs that are inlined in the profiled binary. This is used // for ThinLink to make correct liveness analysis, and also make the IR // match the profiled binary before annotation. - F.setEntryCount(Samples->getHeadSamples() + 1, &InlinedGUIDs); + F.setEntryCount(Samples->getHeadSamples() + 1, false, &InlinedGUIDs); // Compute dominance and loop info needed for propagation. computeDominanceAndLoopInfo(F); Index: lib/Transforms/IPO/SyntheticCountsPropagation.cpp =================================================================== --- /dev/null +++ lib/Transforms/IPO/SyntheticCountsPropagation.cpp @@ -0,0 +1,127 @@ +//=- SyntheticCountsPropagation.cpp - Propagate function counts --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a transformation that synthesizes entry counts for +// functions and attaches !prof metadata to functions with the synthesized +// counts. The presence of !prof metadata with counter name set to +// 'synthesized_function_entry_count' indicate that the value of the counter is +// an estimation of the likely execution count of the function. This transform +// is applied only in non PGO mode as functions get 'real' profile-based +// function entry counts in the PGO mode. +// +// The transformation works by first assigning some initial values to the entry +// counts of all functions and then doing a top-down traversal of the +// callgraph-scc to propagate the counts. For each function the set of callsites +// and their relative block frequency is gathered. The relative block frequency +// multiplied by the entry count of the caller and added to the callee's entry +// count. For non-trivial SCCs, the new counts are computed from the previous +// counts and updated in one shot. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/SyntheticCountsUtils.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using Scaled64 = ScaledNumber; + +#define DEBUG_TYPE "synthetic-counts-propagation" + +/// Initial synthetic count assigned to functions. +static cl::opt + InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10), + cl::ZeroOrMore, + cl::desc("Initial value of synthetic entry count.")); + +/// Initial synthetic count assigned to inline functions. +static cl::opt InlineSyntheticCount( + "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore, + cl::desc("Initial synthetic entry count for inline functions.")); + +/// Initial synthetic count assigned to cold functions. +static cl::opt ColdSyntheticCount( + "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore, + cl::desc("Initial synthetic entry count for cold functions.")); + +// Assign initial synthetic entry counts to functions. +static void +initializeCounts(Module &M, function_ref SetCount) { + auto MayHaveIndirectCalls = [](Function &F) { + for (auto *U : F.users()) { + if (!isa(U) && !isa(U)) + return true; + } + return false; + }; + + for (Function &F : M) { + uint64_t InitialCount = InitialSyntheticCount; + if (F.isDeclaration()) + continue; + if (F.hasFnAttribute(Attribute::AlwaysInline) || + F.hasFnAttribute(Attribute::InlineHint)) { + // Use a higher value for inline functions to account for the fact that + // these are usually beneficial to inline. + InitialCount = InlineSyntheticCount; + } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) { + // Local functions without inline hints get counts only through + // propagation. + InitialCount = 0; + } else if (F.hasFnAttribute(Attribute::Cold) || + F.hasFnAttribute(Attribute::NoInline)) { + // Use a lower value for noinline and cold functions. + InitialCount = ColdSyntheticCount; + } + SetCount(&F, InitialCount); + } +} + +PreservedAnalyses SyntheticCountsPropagation::run(Module &M, + ModuleAnalysisManager &MAM) { + FunctionAnalysisManager &FAM = + MAM.getResult(M).getManager(); + DenseMap Counts; + // Set initial entry counts. + initializeCounts(M, [&](Function *F, uint64_t Count) { Counts[F] = Count; }); + + // Compute the relative block frequency for a callsite. Use scaled numbers + // and not integers since the relative block frequency could be less than 1. + auto GetCallSiteRelFreq = [&](CallSite CS) { + Function *Caller = CS.getCaller(); + auto &BFI = FAM.getResult(*Caller); + BasicBlock *CSBB = CS.getInstruction()->getParent(); + Scaled64 EntryFreq(BFI.getEntryFreq(), 0); + Scaled64 BBFreq(BFI.getBlockFreq(CSBB).getFrequency(), 0); + BBFreq /= EntryFreq; + return BBFreq; + }; + + CallGraph CG(M); + // Propgate the entry counts on the callgraph. + propagateSyntheticCounts( + CG, GetCallSiteRelFreq, [&](Function *F) { return Counts[F]; }, + [&](Function *F, uint64_t New) { Counts[F] += New; }); + + // Set the counts as metadata. + for (auto Entry : Counts) + Entry.first->setEntryCount(Entry.second, true); + + return PreservedAnalyses::all(); +} Index: test/Transforms/SyntheticCountsPropagation/initial.ll =================================================================== --- /dev/null +++ test/Transforms/SyntheticCountsPropagation/initial.ll @@ -0,0 +1,79 @@ +; RUN: opt -passes=synthetic-counts-propagation -S < %s | FileCheck %s + +; CHECK-LABEL: define void @foo() +; CHECK: !prof ![[COUNT1:[0-9]+]] +define void @foo() { + ret void +} + +; CHECK-LABEL: define void @foo_inline() #0 +; CHECK: !prof ![[COUNT2:[0-9]+]] +define void @foo_inline() #0 { + ret void +} + +; CHECK-LABEL: define void @foo_always_inline() #1 +; CHECK: !prof ![[COUNT2]] +define void @foo_always_inline() #1 { + ret void +} + +; CHECK-LABEL: define void @foo_cold() #2 +; CHECK: !prof ![[COUNT3:[0-9]+]] +define void @foo_cold() #2 { + ret void +} + +; CHECK-LABEL: define void @foo_noinline() #3 +; CHECK: !prof ![[COUNT3]] +define void @foo_noinline() #3 { + ret void +} + +; CHECK-LABEL: define internal void @foo_local() +; CHECK: !prof ![[COUNT4:[0-9]+]] +define internal void @foo_local() { + ret void +} + +; CHECK-LABEL: define internal void @foo_local_escaped() +; CHECK: !prof ![[COUNT1]] +define internal void @foo_local_escaped() { + ret void +} + +declare void @ext(void ()*) + +define void @bar() { + call void @ext(void ()* nonnull @foo_local_escaped) + ret void +} + +; CHECK-LABEL: define internal void @foo_local_inline() #0 +; CHECK: !prof ![[COUNT2]] +define internal void @foo_local_inline() #0 { + ret void +} + +; CHECK-LABEL: define internal void @foo_local_cold() #2 +; CHECK: !prof ![[COUNT4]] +define internal void @foo_local_cold() #2 { + ret void +} + +; CHECK-LABEL: define linkonce void @foo_linkonce() +; CHECK: !prof ![[COUNT1]] +define linkonce void @foo_linkonce() { + ret void +} + +; CHECK: ![[COUNT1]] = !{!"synthetic_function_entry_count", i64 10} +; CHECK: ![[COUNT2]] = !{!"synthetic_function_entry_count", i64 15} +; CHECK: ![[COUNT3]] = !{!"synthetic_function_entry_count", i64 5} +; CHECK: ![[COUNT4]] = !{!"synthetic_function_entry_count", i64 0} + +attributes #0 = {inlinehint} +attributes #1 = {alwaysinline} +attributes #2 = {cold} +attributes #3 = {noinline} + Index: test/Transforms/SyntheticCountsPropagation/prop.ll =================================================================== --- /dev/null +++ test/Transforms/SyntheticCountsPropagation/prop.ll @@ -0,0 +1,50 @@ +; RUN: opt -passes=synthetic-counts-propagation -S < %s | FileCheck %s + +; CHECK-LABEL: define void @level1a(i32 %n) +; CHECK: !prof ![[COUNT1:[0-9]+]] +define void @level1a(i32 %n) { +entry: + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %exit, label %loop +loop: + %i = phi i32 [%n, %entry], [%i1, %loop] + call void @level2a(i32 %n) + %i1 = sub i32 %i, 1 + %cmp2 = icmp eq i32 %i1, 0 + br i1 %cmp2, label %exit, label %loop, !prof !1 +exit: + ret void +} + +; CHECK-LABEL: define void @level2a(i32 %n) +; CHECK: !prof ![[COUNT2:[0-9]+]] +define void @level2a(i32 %n) { + call void @level2b(i32 %n) + ret void +} + +; CHECK-LABEL: define void @level2b(i32 %n) +; CHECK: !prof ![[COUNT2]] +define void @level2b(i32 %n) { +entry: + call void @level2a(i32 %n) + %cmp = icmp eq i32 %n, 0 + br i1 %cmp, label %then, label %else, !prof !2 +then: + call void @level3a(i32 %n) + br label %else +else: + ret void +} + +; CHECK-LABEL: define internal void @level3a(i32 %n) +; CHECK: !prof ![[COUNT3:[0-9]+]] +define internal void @level3a(i32 %n) { + ret void +} + +!1 = !{!"branch_weights", i32 1, i32 99} +!2 = !{!"branch_weights", i32 1, i32 1} +; CHECK: ![[COUNT1]] = !{!"synthetic_function_entry_count", i64 10} +; CHECK: ![[COUNT2]] = !{!"synthetic_function_entry_count", i64 520} +; CHECK: ![[COUNT3]] = !{!"synthetic_function_entry_count", i64 260} Index: test/Transforms/SyntheticCountsPropagation/scc.ll =================================================================== --- /dev/null +++ test/Transforms/SyntheticCountsPropagation/scc.ll @@ -0,0 +1,19 @@ +; RUN: opt -passes=synthetic-counts-propagation -S < %s | FileCheck %s + +; CHECK-LABEL: define void @foo() +; CHECK: !prof ![[COUNT1:[0-9]+]] +define void @foo() { + call void @bar() + ret void +} + +; CHECK-LABEL: define void @bar() #0 +; CHECK: !prof ![[COUNT1]] +define void @bar() #0 { + call void @foo() + ret void +} + +attributes #0 = {inlinehint} + +; CHECK: ![[COUNT1]] = !{!"synthetic_function_entry_count", i64 25}