Index: llvm/include/llvm/CodeGen/FlowSensitiveSampleProfile.h
===================================================================
--- /dev/null
+++ llvm/include/llvm/CodeGen/FlowSensitiveSampleProfile.h
@@ -0,0 +1,89 @@
+//===----- FlowSensitiveSampleProfile.h: FS SampleFDO Support ---*- c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the supporting functions for Flow Sensitive Sample FDO
+// loader.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_FLOWSENSITIVESAMPLEPROFILE_H
+#define LLVM_CODEGEN_FLOWSENSITIVESAMPLEPROFILE_H
+
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+
+#include <cassert>
+
+namespace llvm {
+
+using namespace sampleprof;
+
+class FSProfileLoader;
+
+/// Machine function pass that drives an FSProfileLoader: it passes the profile
+/// file name and the FS discriminator pass to the loader, and hands the loader
+/// the machine analyses it needs before running it on each function.
+class FSProfileLoaderPass : public MachineFunctionPass {
+  /// Set by runOnMachineFunction; null until a function has been processed.
+  MachineFunction *MF = nullptr;
+  std::string ProfileFileName;
+  FSDiscriminatorPass P;
+  /// Discriminator bit positions from getFSPassBitBegin(P)/getFSPassBitEnd(P).
+  unsigned LowBit;
+  unsigned HighBit;
+
+public:
+  static char ID;
+  /// Creates the pass for profile \p Filename and discriminator pass \p P, and
+  /// caches the discriminator bit positions that belong to \p P.
+  // NOTE(review): FSProfileLoader is only forward-declared in this header;
+  // confirm std::make_unique can see the complete type here, or define this
+  // constructor out of line.
+  FSProfileLoaderPass(std::string Filename = "",
+                      FSDiscriminatorPass P = FSDiscriminatorPass::Pass1)
+      : MachineFunctionPass(ID), ProfileFileName(Filename), P(P),
+        FSSampleLoader(std::make_unique<FSProfileLoader>(Filename)) {
+    LowBit = getFSPassBitBegin(P);
+    HighBit = getFSPassBitEnd(P);
+    assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit");
+  }
+
+  /// getMachineFunction - Return the last machine function computed.
+  const MachineFunction *getMachineFunction() const { return MF; }
+
+private:
+  void init(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &) override;
+  bool doInitialization(Module &M) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  std::unique_ptr<FSProfileLoader> FSSampleLoader;
+  /// Hold the information of the basic block frequency.
+  MachineBlockFrequencyInfo *MBFI = nullptr;
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_FLOWSENSITIVESAMPLEPROFILE_H
Index: llvm/include/llvm/CodeGen/MachineDominators.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineDominators.h
+++ llvm/include/llvm/CodeGen/MachineDominators.h
@@ -112,6 +112,12 @@
     return DT->dominates(A, B);
   }
 
+  void getDescendants(MachineBasicBlock *A,
+                      SmallVectorImpl<MachineBasicBlock *> &Result) {
+    applySplitCriticalEdges();
+    DT->getDescendants(A, Result);
+  }
+
   bool dominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const {
     applySplitCriticalEdges();
     return DT->dominates(A, B);
Index: llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
+++ llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
@@ -118,6 +118,12 @@
       : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkAnalysis,
                                       PassName, RemarkName, Loc, MBB) {}
 
+  MachineOptimizationRemarkAnalysis(const char *PassName, StringRef RemarkName,
+                                    const
MachineInstr *MI) + : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkAnalysis, + PassName, RemarkName, MI->getDebugLoc(), + MI->getParent()) {} + static bool classof(const DiagnosticInfo *DI) { return DI->getKind() == DK_MachineOptimizationRemarkAnalysis; } Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -171,6 +171,9 @@ /// This pass adds flow sensitive discriminators. extern char &MIRAddFSDiscriminatorsID; + /// This pass reads flow sensitive profile. + extern char &FSProfileLoaderPassID; + /// FastRegisterAllocation Pass - This pass register allocates as fast as /// possible. It is best suited for debug code where live ranges are short. /// @@ -513,6 +516,10 @@ FunctionPass * createMIRAddFSDiscriminatorsPass(sampleprof::FSDiscriminatorPass P); + /// Read Flow Sensitive Profile. + FunctionPass *createFSProfileLoaderPass(std::string File, + sampleprof::FSDiscriminatorPass P); + /// Creates MIR Debugify pass. 
\see MachineDebugify.cpp ModulePass *createDebugifyMachineModulePass(); Index: llvm/include/llvm/IR/DebugInfoMetadata.h =================================================================== --- llvm/include/llvm/IR/DebugInfoMetadata.h +++ llvm/include/llvm/IR/DebugInfoMetadata.h @@ -2212,7 +2212,8 @@ return getCopyIdentifierFromDiscriminator(getDiscriminator()); } -Optional DILocation::cloneWithBaseDiscriminator(unsigned D) const { +Optional +DILocation::cloneWithBaseDiscriminator(unsigned D) const { unsigned BD, DF, CI; if (EnableFSDiscriminator) { @@ -2225,12 +2226,15 @@ decodeDiscriminator(getDiscriminator(), BD, DF, CI); if (D == BD) return this; + if (EnableFSDiscriminator) + return cloneWithDiscriminator(D); if (Optional Encoded = encodeDiscriminator(D, DF, CI)) return cloneWithDiscriminator(*Encoded); return None; } -Optional DILocation::cloneByMultiplyingDuplicationFactor(unsigned DF) const { +Optional +DILocation::cloneByMultiplyingDuplicationFactor(unsigned DF) const { assert(!EnableFSDiscriminator && "FSDiscriminator should not call this."); DF *= getDuplicationFactor(); Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -64,6 +64,7 @@ void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&); +void initializeAddFSDiscriminatorsPass(PassRegistry &); void initializeModuleAddressSanitizerLegacyPassPass(PassRegistry &); void initializeASanGlobalsMetadataWrapperPassPass(PassRegistry &); void initializeAddressSanitizerLegacyPassPass(PassRegistry &); @@ -159,6 +160,7 @@ void initializeMakeGuardsExplicitLegacyPassPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); void initializeFEntryInserterPass(PassRegistry&); +void initializeFSProfileLoaderPassPass(PassRegistry &); void 
initializeFinalizeISelPass(PassRegistry&); void initializeFinalizeMachineBundlesPass(PassRegistry&); void initializeFixIrreduciblePass(PassRegistry &); Index: llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h =================================================================== --- llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -56,11 +56,12 @@ using FunctionT = Function; using BlockFrequencyInfoT = BlockFrequencyInfo; using LoopT = Loop; - using LoopInfoT = LoopInfo; + using LoopInfoPtrT = std::unique_ptr; + using DominatorTreePtrT = std::unique_ptr; + using PostDominatorTreeT = PostDominatorTree; + using PostDominatorTreePtrT = std::unique_ptr; using OptRemarkEmitterT = OptimizationRemarkEmitter; using OptRemarkAnalysisT = OptimizationRemarkAnalysis; - using DominatorTreeT = DominatorTree; - using PostDominatorTreeT = PostDominatorTree; static Function &getFunction(Function &F) { return F; } static const BasicBlock *getEntryBB(const Function *F) { return &F->getEntryBlock(); @@ -85,14 +86,17 @@ typename afdo_detail::IRTraits::BlockFrequencyInfoT; using FunctionT = typename afdo_detail::IRTraits::FunctionT; using LoopT = typename afdo_detail::IRTraits::LoopT; - using LoopInfoT = typename afdo_detail::IRTraits::LoopInfoT; + using LoopInfoPtrT = typename afdo_detail::IRTraits::LoopInfoPtrT; + using DominatorTreePtrT = + typename afdo_detail::IRTraits::DominatorTreePtrT; + using PostDominatorTreePtrT = + typename afdo_detail::IRTraits::PostDominatorTreePtrT; + using PostDominatorTreeT = + typename afdo_detail::IRTraits::PostDominatorTreeT; using OptRemarkEmitterT = typename afdo_detail::IRTraits::OptRemarkEmitterT; using OptRemarkAnalysisT = typename afdo_detail::IRTraits::OptRemarkAnalysisT; - using DominatorTreeT = typename afdo_detail::IRTraits::DominatorTreeT; - using PostDominatorTreeT = - typename afdo_detail::IRTraits::PostDominatorTreeT; using BlockWeightMap 
= DenseMap; using EquivalenceClassMap = @@ -129,12 +133,11 @@ void findEquivalencesFor(BasicBlockT *BB1, ArrayRef Descendants, PostDominatorTreeT *DomTree); - void propagateWeights(FunctionT &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(FunctionT &F); bool propagateThroughEdges(FunctionT &F, bool UpdateBlockCount); - void clearFunctionData(); + void clearFunctionData(bool ResetDT = true); void computeDominanceAndLoopInfo(FunctionT &F); bool computeAndPropagateWeights(FunctionT &F, @@ -168,9 +171,9 @@ EquivalenceClassMap EquivalenceClass; /// Dominance, post-dominance and loop information. - std::unique_ptr DT; - std::unique_ptr PDT; - std::unique_ptr LI; + DominatorTreePtrT DT; + PostDominatorTreePtrT PDT; + LoopInfoPtrT LI; /// Predecessors for each basic block in the CFG. BlockEdgeMap Predecessors; @@ -199,15 +202,17 @@ /// Clear all the per-function data used to load samples and propagate weights. template -void SampleProfileLoaderBaseImpl::clearFunctionData() { +void SampleProfileLoaderBaseImpl::clearFunctionData(bool ResetDT) { BlockWeights.clear(); EdgeWeights.clear(); VisitedBlocks.clear(); VisitedEdges.clear(); EquivalenceClass.clear(); - DT = nullptr; - PDT = nullptr; - LI = nullptr; + if (ResetDT) { + DT = nullptr; + PDT = nullptr; + LI = nullptr; + } Predecessors.clear(); Successors.clear(); CoverageTracker.clear(); @@ -475,7 +480,7 @@ // class by making BB2's equivalence class be BB1. 
DominatedBBs.clear(); DT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, PDT.get()); + findEquivalencesFor(BB1, DominatedBBs, &*PDT); LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1)); } @@ -911,12 +916,12 @@ template void SampleProfileLoaderBaseImpl::computeDominanceAndLoopInfo( FunctionT &F) { - DT.reset(new DominatorTreeT); + DT.reset(new DominatorTree); DT->recalculate(F); PDT.reset(new PostDominatorTree(F)); - LI.reset(new LoopInfoT); + LI.reset(new LoopInfo); LI->analyze(*DT); } Index: llvm/lib/CodeGen/CMakeLists.txt =================================================================== --- llvm/lib/CodeGen/CMakeLists.txt +++ llvm/lib/CodeGen/CMakeLists.txt @@ -33,6 +33,7 @@ FEntryInserter.cpp FinalizeISel.cpp FixupStatepointCallerSaved.cpp + FlowSensitiveSampleProfile.cpp FuncletLayout.cpp GCMetadata.cpp GCMetadataPrinter.cpp Index: llvm/lib/CodeGen/FlowSensitiveSampleProfile.cpp =================================================================== --- /dev/null +++ llvm/lib/CodeGen/FlowSensitiveSampleProfile.cpp @@ -0,0 +1,364 @@ +//===-------- FlowSensitiveSampleProfile.cpp: Flow Sensitive SampleFDO-----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the implementation of the flow sensitive SampleFDO. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/FlowSensitiveSampleProfile.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" +#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::sampleprofutil; +using ProfileCount = Function::ProfileCount; + +#define DEBUG_TYPE "fs-profile-loader" + +static cl::opt + DisableFSProfileLoader("disable-fs-profile-loader", cl::Hidden, + cl::init(false), + cl::desc("Disable flow senstive profile loading")); +static cl::opt EnableFSBranchProb( + "enable-fs-branchprob", cl::Hidden, cl::init(true), + cl::desc("Enable seting flow senstive branch probabilities")); +static cl::opt ShowFSBranchProb( + "show-fs-branchprob", cl::Hidden, cl::init(false), + cl::desc("Print setting flow senstive branch probabilities")); +static cl::opt FSProfileDebugProbDiffThreshold( + "fs-profile-debug-prob-diff-threshold", cl::init(10), + cl::desc("Only show debug message if the branch probility is greater than " + "this value (in percentage).")); + +static cl::opt FSProfileDebugBWThreshold( + "fs-profile-debug-bw-threshold", cl::init(10000), + cl::desc("Only show debug message if the source branch weight is greater " + " than this value.")); + +static cl::opt ViewBFIBefore("fs-viewbfi-before", cl::Hidden, + cl::init(false), + cl::desc("View BFI before FS loader")); +static cl::opt ViewBFIAfter("fs-viewbfi-after", cl::Hidden, + cl::init(false), + cl::desc("View BFI after FS loader")); + +char FSProfileLoaderPass::ID = 0; + +INITIALIZE_PASS_BEGIN(FSProfileLoaderPass, DEBUG_TYPE, + "Load Flow Sensitive Profile", + /* cfg = */ false, /* is_analysis 
= */ false) +INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) +INITIALIZE_PASS_END(FSProfileLoaderPass, DEBUG_TYPE, + "Load Flow Sensitive Profile", + /* cfg = */ false, /* is_analysis = */ false) + +char &llvm::FSProfileLoaderPassID = FSProfileLoaderPass::ID; + +FunctionPass *llvm::createFSProfileLoaderPass(std::string File, + FSDiscriminatorPass P) { + return new FSProfileLoaderPass(File, P); +} + +namespace llvm { + +// Internal option used to control BFI display only after MBP pass. +// Defined in CodeGen/MachineBlockFrequencyInfo.cpp: +// -view-block-layout-with-bfi={none | fraction | integer | count} +extern cl::opt ViewBlockLayoutWithBFI; + +// Command line option to specify the name of the function for CFG dump +// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= +extern cl::opt ViewBlockFreqFuncName; + +namespace afdo_detail { +template <> struct IRTraits { + using InstructionT = MachineInstr; + using BasicBlockT = MachineBasicBlock; + using FunctionT = MachineFunction; + using BlockFrequencyInfoT = MachineBlockFrequencyInfo; + using LoopT = MachineLoop; + using LoopInfoPtrT = MachineLoopInfo *; + using DominatorTreePtrT = MachineDominatorTree *; + using PostDominatorTreePtrT = MachinePostDominatorTree *; + using PostDominatorTreeT = MachinePostDominatorTree; + using OptRemarkEmitterT = MachineOptimizationRemarkEmitter; + using OptRemarkAnalysisT = MachineOptimizationRemarkAnalysis; + static Function &getFunction(MachineFunction &F) { return F.getFunction(); } + static const MachineBasicBlock *getEntryBB(const MachineFunction *F) { + return GraphTraits::getEntryNode(F); + } +}; +} // namespace afdo_detail + +class FSProfileLoader final + : public SampleProfileLoaderBaseImpl { +public: + void setInitVals(MachineDominatorTree 
*MDT_, MachinePostDominatorTree *MPDT_, + MachineLoopInfo *MLI_, MachineBlockFrequencyInfo *MBFI_, + MachineOptimizationRemarkEmitter *ORE_) { + DT = MDT_; + PDT = MPDT_; + LI = MLI_; + MBFI = MBFI_; + ORE = ORE_; + } + void setFSPass(FSDiscriminatorPass Pass) { + P = Pass; + LowBit = getFSPassBitBegin(P); + HighBit = getFSPassBitEnd(P); + assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit"); + } + + FSProfileLoader(StringRef Name) + : SampleProfileLoaderBaseImpl(std::string(Name)) {} + + void setBranchProbs(MachineFunction &F); + bool runOnFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool isValid() const { return ProfileIsValid; } + +protected: + friend class SampleCoverageTracker; + + /// Hold the information of the basic block frequency. + MachineBlockFrequencyInfo *MBFI; + + /// PassNum is the sequence number this pass is called, start from 1. + FSDiscriminatorPass P; + + // LowBit in the FS discriminator used by this instance. Note the number is + // 0-based. Base discrimnator use bit 0 to bit 11. + unsigned LowBit; + // HighwBit in the FS discriminator used by this instance. Note the number + // is 0-based. + unsigned HighBit; + + bool ProfileIsValid = true; +}; + +template <> +void SampleProfileLoaderBaseImpl< + MachineBasicBlock>::computeDominanceAndLoopInfo(MachineFunction &F) {} + +/// Build in/out edge lists for each basic block in the CFG. +/// +/// We are interested in unique edges. If a block B1 has multiple +/// edges to another block B2, we only add a single B1->B2 edge. +template <> +void SampleProfileLoaderBaseImpl::buildEdges(FunctionT &F) { + for (auto &BI : F) { + BasicBlockT *B1 = &BI; + + // Add predecessors for B1. + SmallPtrSet Visited; + if (!Predecessors[B1].empty()) + llvm_unreachable("Found a stale predecessors list in a basic block."); + for (auto *B2 : B1->predecessors()) { + if (Visited.insert(B2).second) + Predecessors[B1].push_back(B2); + } + + // Add successors for B1. 
+    Visited.clear();
+    if (!Successors[B1].empty())
+      llvm_unreachable("Found a stale successors list in a basic block.");
+    for (auto *B2 : B1->successors()) {
+      if (Visited.insert(B2).second)
+        Successors[B1].push_back(B2);
+    }
+  }
+}
+
+/// Convert the propagated edge weights into successor branch probabilities.
+///
+/// For every block of \p F with at least two successors, each successor edge
+/// gets the probability EdgeWeight / BBWeight, where BBWeight is the sum of the
+/// block's outgoing edge weights. Blocks whose outgoing weights sum to zero are
+/// skipped. The new probabilities are always installed; -show-fs-branchprob
+/// only controls whether a changed edge is also printed.
+void FSProfileLoader::setBranchProbs(MachineFunction &F) {
+  LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch probs\n");
+  for (auto &BI : F) {
+    MachineBasicBlock *BB = &BI;
+    if (BB->succ_size() < 2)
+      continue;
+    const MachineBasicBlock *EC = EquivalenceClass[BB];
+    uint64_t BBWeight = BlockWeights[EC];
+    uint64_t SumEdgeWeight = 0;
+    for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+                                          SE = BB->succ_end();
+         SI != SE; ++SI) {
+      MachineBasicBlock *Succ = *SI;
+      Edge E = std::make_pair(BB, Succ);
+      SumEdgeWeight += EdgeWeights[E];
+    }
+
+    // The edge weights are what the probabilities are derived from, so use
+    // their sum as the denominator when it disagrees with the block weight.
+    if (BBWeight != SumEdgeWeight) {
+      LLVM_DEBUG(dbgs() << "BBweight is not equal to SumEdgeWeight: BBWWeight="
+                        << BBWeight << " SumEdgeWeight= " << SumEdgeWeight
+                        << "\n");
+      BBWeight = SumEdgeWeight;
+    }
+    if (BBWeight == 0) {
+      LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+      continue;
+    }
+
+    uint64_t BBWeight_Orig = BBWeight;
+    uint32_t MaxWeight = std::numeric_limits<uint32_t>::max();
+    // Kept 64-bit: BBWeight / MaxWeight + 1 can need more than 32 bits for
+    // very large weights, and a truncated factor would scale incorrectly or
+    // divide by zero below.
+    uint64_t Factor = 1;
+    if (BBWeight > MaxWeight) {
+      Factor = BBWeight / MaxWeight + 1;
+      BBWeight /= Factor;
+      LLVM_DEBUG(dbgs() << "Scaling weights by " << Factor << "\n");
+    }
+
+    for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+                                          SE = BB->succ_end();
+         SI != SE; ++SI) {
+      MachineBasicBlock *Succ = *SI;
+      Edge E = std::make_pair(BB, Succ);
+      uint64_t EdgeWeight = EdgeWeights[E];
+      EdgeWeight /= Factor;
+
+      assert(BBWeight >= EdgeWeight &&
+             "BBWeight is smaller than EdgeWeight -- should not happen.\n");
+
+      BranchProbability OldProb = MBFI->getMBPI()->getEdgeProbability(BB, SI);
+      BranchProbability NewProb(EdgeWeight, BBWeight);
+      if (OldProb == NewProb)
+        continue;
+      BB->setSuccProbability(SI, NewProb);
+
+      // Everything below is reporting only.
+      if (!ShowFSBranchProb)
+        continue;
+
+      // Report an edge only if its probability moved by at least the
+      // configured percentage and the unscaled source weight is large enough.
+      BranchProbability Diff;
+      if (OldProb > NewProb)
+        Diff = OldProb - NewProb;
+      else
+        Diff = NewProb - OldProb;
+      bool Show =
+          (Diff >= BranchProbability(FSProfileDebugProbDiffThreshold, 100)) &&
+          (BBWeight_Orig >= FSProfileDebugBWThreshold);
+      if (!Show)
+        continue;
+
+      auto DIL = BB->findBranchDebugLoc();
+      auto SuccDIL = Succ->findBranchDebugLoc();
+      dbgs() << "Set branch fs prob: MBB (" << BB->getNumber() << " -> "
+             << Succ->getNumber() << "): ";
+      if (DIL)
+        dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+               << DIL->getColumn();
+      if (SuccDIL)
+        dbgs() << "-->" << SuccDIL->getFilename() << ":" << SuccDIL->getLine()
+               << ":" << SuccDIL->getColumn();
+      dbgs() << " W=" << BBWeight_Orig << " " << OldProb << " --> " << NewProb
+             << "\n";
+    }
+  }
+}
+
+/// Create the sample profile reader for \p M and read the profile.
+///
+/// Returns false and marks the loader invalid when the reader cannot be
+/// created. Otherwise returns true, with isValid() reflecting whether reading
+/// the profile succeeded.
+bool FSProfileLoader::doInitialization(Module &M) {
+  auto &Ctx = M.getContext();
+
+  auto ReaderOrErr =
+      sampleprof::SampleProfileReader::create(Filename, Ctx, P, "");
+  if (std::error_code EC = ReaderOrErr.getError()) {
+    std::string Msg = "Could not open profile: " + EC.message();
+    Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+    // Reader is still unset here. If the diagnostic handler returns instead of
+    // aborting, runOnFunction must not be reached with a null Reader.
+    ProfileIsValid = false;
+    return false;
+  }
+
+  Reader = std::move(ReaderOrErr.get());
+  Reader->setModule(&M);
+  ProfileIsValid = (Reader->read() == sampleprof_error::success);
+  // NOTE(review): the value returned by getSummary() is not used here.
+  Reader->getSummary();
+
+  return true;
+}
+
+/// Annotate \p F from the flow sensitive profile.
+///
+/// Returns false when the reader has no (or empty) samples for the function or
+/// getFunctionLoc reports 0; otherwise returns the result of
+/// computeAndPropagateWeights.
+bool FSProfileLoader::runOnFunction(FunctionT &F) {
+  Function &Func = F.getFunction();
+  // Keep DT/PDT/LI: setInitVals installed them, and the machine IR
+  // specialization of computeDominanceAndLoopInfo does not recompute them.
+  clearFunctionData(false);
+  Samples = Reader->getSamplesFor(Func);
+  if (!Samples || Samples->empty())
+    return false;
+
+  if (getFunctionLoc(F) == 0)
+    return false;
+
+  DenseSet<GlobalValue::GUID> InlinedGUIDs;
+  bool Changed = computeAndPropagateWeights(F, InlinedGUIDs);
+
+  // Set the new BPI, BFI.
+  if (EnableFSBranchProb)
+    setBranchProbs(F);
+
+  return Changed;
+}
+
+} // namespace llvm
+
+/// Run the FS profile loader on \p mf, unless loading is disabled or the
+/// loader is not valid.
+bool FSProfileLoaderPass::runOnMachineFunction(MachineFunction &mf) {
+  if (DisableFSProfileLoader)
+    return false;
+  if (!FSSampleLoader->isValid())
+    return false;
+
+  MF = &mf;
+  LLVM_DEBUG(dbgs() << "FSProfileLoader pass working on Func: "
+                    << MF->getFunction().getName() << "\n");
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+  FSSampleLoader->setInitVals(
+      &getAnalysis<MachineDominatorTree>(),
+      &getAnalysis<MachinePostDominatorTree>(), &getAnalysis<MachineLoopInfo>(),
+      MBFI, &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE());
+
+  MF->RenumberBlocks();
+  if (ViewBFIBefore && ViewBlockLayoutWithBFI != GVDT_None &&
+      (ViewBlockFreqFuncName.empty() ||
+       MF->getFunction().getName().equals(ViewBlockFreqFuncName))) {
+    MBFI->view("FSP_b." + MF->getName(), false);
+  }
+
+  bool Changed = FSSampleLoader->runOnFunction(mf);
+
+  if (ViewBFIAfter && ViewBlockLayoutWithBFI != GVDT_None &&
+      (ViewBlockFreqFuncName.empty() ||
+       MF->getFunction().getName().equals(ViewBlockFreqFuncName))) {
+    MBFI->view("FSP_a."
+ MF->getName(), false); + } + + return Changed; +} + +bool FSProfileLoaderPass::doInitialization(Module &M) { + if (DisableFSProfileLoader) + return false; + LLVM_DEBUG(dbgs() << "FSProfileLoader pass working on Module " << M.getName() + << "\n"); + + FSSampleLoader->setFSPass(P); + return FSSampleLoader->doInitialization(M); +} + +void FSProfileLoaderPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequiredTransitive(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); +} Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -343,6 +343,7 @@ namespace llvm { extern cl::opt EnableFSDiscriminator; +extern cl::opt FSProfileFile; class PassConfigImpl { public: @@ -1072,6 +1073,12 @@ RegAlloc("regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), cl::desc("Register allocator to use")); +static std::string getFSProfileFile() { + if (FSProfileFile.empty() || !EnableFSDiscriminator) + return std::string(); + return FSProfileFile.getValue(); +} + /// Add the complete set of target-independent postISel code generator passes. /// /// This can be read as the standard order of major LLVM CodeGen stages. Stages @@ -1113,6 +1120,14 @@ // where it becomes safe again so stop debugifying here. DebugifyIsSafe = false; + if (EnableFSDiscriminator) + addPass(createMIRAddFSDiscriminatorsPass( + sampleprof::FSDiscriminatorPass::Pass1)); + std::string FSFile = getFSProfileFile(); + if (!FSFile.empty()) + addPass(createFSProfileLoaderPass(FSFile, + sampleprof::FSDiscriminatorPass::Pass1)); + // Run register allocation and passes that are tightly coupled with it, // including phi elimination and scheduling. if (getOptimizeRegAlloc()) @@ -1465,6 +1480,13 @@ /// Add standard basic block placement passes. 
void TargetPassConfig::addBlockPlacement() { + if (EnableFSDiscriminator) + addPass(createMIRAddFSDiscriminatorsPass( + sampleprof::FSDiscriminatorPass::Pass2)); + std::string FSFile = getFSProfileFile(); + if (!FSFile.empty()) + addPass(createFSProfileLoaderPass(FSFile, + sampleprof::FSDiscriminatorPass::Pass2)); if (addPass(&MachineBlockPlacementID)) { // Run a separate pass to collect block placement statistics. if (EnableBlockPlacementStats) Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -310,6 +310,7 @@ extern cl::opt FlattenedProfileUsed; +extern cl::opt FSProfileFile; extern cl::opt AttributorRun; extern cl::opt EnableKnowledgeRetention; @@ -1094,6 +1095,7 @@ // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured. if (LoadSampleProfile) EarlyFPM.addPass(InstCombinePass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); if (LoadSampleProfile) { @@ -1107,13 +1109,17 @@ // Do not invoke ICP in the LTOPrelink phase as it makes it hard // for the profile annotation to be accurate in the LTO backend. if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink && - Phase != ThinOrFullLTOPhase::FullLTOPreLink) + Phase != ThinOrFullLTOPhase::FullLTOPreLink) { // We perform early indirect call promotion here, before globalopt. // This is important for the ThinLTO backend phase because otherwise // imported available_externally functions look unreferenced and are // removed. MPM.addPass( PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); + + // Set FSProfileFile so that CodeGen can read the profile. + FSProfileFile.setValue(PGOOpt->ProfileFile); + } } // Try to perform OpenMP specific optimizations on the module. This is a @@ -1677,6 +1683,9 @@ // Cache ProfileSummaryAnalysis once to avoid the potential need to insert // RequireAnalysisPass for PSI before subsequent non-module passes. 
MPM.addPass(RequireAnalysisPass()); + + // Set FSProfileFile so that CodeGen can read the profile. + FSProfileFile.setValue(PGOOpt->ProfileFile); } // Remove unused virtual tables to improve the quality of code generated by Index: llvm/lib/ProfileData/SampleProf.cpp =================================================================== --- llvm/lib/ProfileData/SampleProf.cpp +++ llvm/lib/ProfileData/SampleProf.cpp @@ -36,6 +36,11 @@ "will be used. This is very useful for performance debugging")); namespace llvm { + +cl::opt + FSProfileFile("fs-profile-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Flow Sensitive profile file name."), cl::Hidden); + namespace sampleprof { SampleProfileFormat FunctionSamples::Format; bool FunctionSamples::ProfileIsProbeBased = false; Index: llvm/test/CodeGen/X86/Inputs/fsloader.afdo =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/Inputs/fsloader.afdo @@ -0,0 +1,35 @@ +work:42380966:1346190 + 1: 1246499 + 5: 1246499 +foo:28798256:4267 + 0: 4267 + 2.1: 255999 + 4: 264627 bar:250018 + 4.512: 269485 bar:278102 + 4.4608: 280297 bar:280933 + 4.12288: 278916 bar:267752 + 5: 264627 + 5.4096: 269485 + 5.8192: 260670 + 5.8704: 278916 + 6: 11541 + 6.3584: 278916 work:284547 + 6.4096: 260670 work:249428 + 6.8704: 11541 + 7: 272442 + 7.512: 283590 + 7.4608: 234082 + 7.9728: 279149 + 8: 11541 + 8.11776: 283590 work:305061 + 8.12288: 279149 work:281368 + 8.13824: 234082 work:225786 + 10: 4050 +bar:9504180:1076805 + 2: 1056020 + 3: 1056020 +main:20360:0 + 0: 0 + 2.1: 4045 + 3: 4156 foo:4267 + 5: 0 Index: llvm/test/CodeGen/X86/fsafdo_test1.ll =================================================================== --- llvm/test/CodeGen/X86/fsafdo_test1.ll +++ llvm/test/CodeGen/X86/fsafdo_test1.ll @@ -3,8 +3,8 @@ ; Check that fs-afdo discriminators are generated. 
; CHECK: .loc 1 7 3 is_stmt 0 discriminator 2 # foo.c:7:3 ; Check: .loc 1 9 5 is_stmt 1 discriminator 2 # foo.c:9:5 -; CHECK: .loc 1 9 5 is_stmt 0 discriminator 268435458 # foo.c:9:5 -; CHECK: .loc 1 7 3 is_stmt 1 discriminator 3892314114 # foo.c:7:3 +; CHECK: .loc 1 9 5 is_stmt 0 discriminator 11266 # foo.c:9:5 +; CHECK: .loc 1 7 3 is_stmt 1 discriminator 11266 # foo.c:7:3 ; Check that variable __llvm_fs_discriminator__ is generated. ; CHECK: .type __llvm_fs_discriminator__,@object # @__llvm_fs_discriminator__ ; CHECK: .section .rodata,"a",@progbits Index: llvm/test/CodeGen/X86/fsafdo_test2.ll =================================================================== --- llvm/test/CodeGen/X86/fsafdo_test2.ll +++ llvm/test/CodeGen/X86/fsafdo_test2.ll @@ -1,4 +1,7 @@ ; RUN: llc -enable-fs-discriminator < %s | FileCheck %s +; RUN: llvm-profdata merge --sample -profile-isfs -o %t.afdo %S/Inputs/fsloader.afdo +; RUN: llc -enable-fs-discriminator -fs-profile-file=%t.afdo -show-fs-branchprob < %s 2>&1 | FileCheck %s --check-prefix=LOADER +; ;; ;; C source code for the test (compiler at -O3): ;; // A test case for loop unroll. @@ -38,9 +41,9 @@ ;; ;; Check that fs-afdo discriminators are generated. ; CHECK: .loc 1 23 9 is_stmt 0 discriminator 1 # unroll.c:23:9 -; CHECK: .loc 1 23 9 is_stmt 0 discriminator 3892314113 # unroll.c:23:9 -; CHECK: .loc 1 23 9 is_stmt 0 discriminator 2818572289 # unroll.c:23:9 -; CHECK: .loc 1 23 9 is_stmt 0 discriminator 3623878657 # unroll.c:23:9 +; CHECK: .loc 1 23 9 is_stmt 0 discriminator 3585 # unroll.c:23:9 +; CHECK: .loc 1 23 9 is_stmt 0 discriminator 8705 # unroll.c:23:9 +; CHECK: .loc 1 23 9 is_stmt 0 discriminator 4097 # unroll.c:23:9 ;; ;; Check that variable __llvm_fs_discriminator__ is generated. ; CHECK: .type __llvm_fs_discriminator__,@object # @__llvm_fs_discriminator__ @@ -50,6 +53,25 @@ ; CHECK: .byte 1 ; CHECK: .size __llvm_fs_discriminator__, 1 +;; Check that new branch probs are generated. 
+; LOADER: Set branch fs prob: MBB (1 -> 3): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x7aca7894 / 0x80000000 = 95.93% +; LOADER: Set branch fs prob: MBB (1 -> 2): unroll.c:22:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x0535876c / 0x80000000 = 4.07% +; LOADER: Set branch fs prob: MBB (3 -> 5): unroll.c:24:11-->unroll.c:22:11 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x7aca7894 / 0x80000000 = 95.93% +; LOADER: Set branch fs prob: MBB (3 -> 4): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x0535876c / 0x80000000 = 4.07% +; LOADER: Set branch fs prob: MBB (5 -> 8): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x021c112e / 0x80000000 = 1.65% +; LOADER: Set branch fs prob: MBB (5 -> 7): unroll.c:22:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x7de3eed2 / 0x80000000 = 98.35% +; LOADER: Set branch fs prob: MBB (8 -> 10): unroll.c:24:11-->unroll.c:22:11 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x00000000 / 0x80000000 = 0.00% +; LOADER: Set branch fs prob: MBB (8 -> 9): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x80000000 / 0x80000000 = 100.00% +; LOADER: Set branch fs prob: MBB (10 -> 12): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x7aca7894 / 0x80000000 = 95.93% +; LOADER: Set branch fs prob: MBB (10 -> 11): unroll.c:22:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x0535876c / 0x80000000 = 4.07% +; LOADER: Set branch fs prob: MBB (12 -> 14): unroll.c:24:11-->unroll.c:22:11 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x02012507 / 0x80000000 = 1.57% +; LOADER: Set branch fs prob: MBB (12 -> 13): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x7dfedaf9 / 0x80000000 = 98.43% +; LOADER: Set branch fs prob: MBB (14 -> 16): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x0a5856e1 / 0x80000000 = 8.08% +; LOADER: Set branch fs prob: MBB (14 -> 15): unroll.c:22:11 
W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x75a7a91f / 0x80000000 = 91.92% +; LOADER: Set branch fs prob: MBB (16 -> 18): unroll.c:24:11-->unroll.c:19:3 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x16588166 / 0x80000000 = 17.46% +; LOADER: Set branch fs prob: MBB (16 -> 17): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x69a77e9a / 0x80000000 = 82.54% + + target triple = "x86_64-unknown-linux-gnu" @sum = dso_local local_unnamed_addr global i32 0, align 4