Index: llvm/include/llvm/CodeGen/MIRSampleProfile.h =================================================================== --- /dev/null +++ llvm/include/llvm/CodeGen/MIRSampleProfile.h @@ -0,0 +1,81 @@ +//===----- MIRSampleProfile.h: SampleFDO Support in MIR ---*- c++ -*-------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the supporting functions for machine level Sample FDO +// loader. This is used in Flow Sensitive SampleFDO. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_FLOWSENSITIVESAMPLEPROFILE_H +#define LLVM_CODEGEN_FLOWSENSITIVESAMPLEPROFILE_H + +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/ProfileData/SampleProfReader.h" + +#include + +namespace llvm { + +using namespace sampleprof; + +class MIRProfileLoader; +class MIRProfileLoaderPass : public MachineFunctionPass { + MachineFunction *MF; + std::string ProfileFileName; + FSDiscriminatorPass P; + unsigned LowBit; + unsigned HighBit; 
+ +public: + static char ID; + /// FS bits will only use the '1' bits in the Mask. + MIRProfileLoaderPass(std::string FileName = "", + std::string RemappingFileName = "", + FSDiscriminatorPass P = FSDiscriminatorPass::Pass1) + : MachineFunctionPass(ID), ProfileFileName(FileName), P(P), + MIRSampleLoader( + std::make_unique(FileName, RemappingFileName)) { + LowBit = getFSPassBitBegin(P); + HighBit = getFSPassBitEnd(P); + assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit"); + } + + /// getMachineFunction - Return the last machine function computed. + const MachineFunction *getMachineFunction() const { return MF; } + +private: + void init(MachineFunction &MF); + bool runOnMachineFunction(MachineFunction &) override; + bool doInitialization(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + std::unique_ptr MIRSampleLoader; + /// Hold the information of the basic block frequency. + MachineBlockFrequencyInfo *MBFI; +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_FLOWSENSITIVESAMPLEPROFILE_H Index: llvm/include/llvm/CodeGen/MachineDominators.h =================================================================== --- llvm/include/llvm/CodeGen/MachineDominators.h +++ llvm/include/llvm/CodeGen/MachineDominators.h @@ -112,6 +112,12 @@ return DT->dominates(A, B); } + void getDescendants(MachineBasicBlock *A, + SmallVectorImpl &Result) { + applySplitCriticalEdges(); + DT->getDescendants(A, Result); + } + bool dominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const { applySplitCriticalEdges(); return DT->dominates(A, B); Index: llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h =================================================================== --- llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h +++ llvm/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h @@ -118,6 +118,12 @@ : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkAnalysis, PassName, RemarkName, Loc, MBB) {} + 
MachineOptimizationRemarkAnalysis(const char *PassName, StringRef RemarkName, + const MachineInstr *MI) + : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkAnalysis, + PassName, RemarkName, MI->getDebugLoc(), + MI->getParent()) {} + static bool classof(const DiagnosticInfo *DI) { return DI->getKind() == DK_MachineOptimizationRemarkAnalysis; } Index: llvm/include/llvm/CodeGen/Passes.h =================================================================== --- llvm/include/llvm/CodeGen/Passes.h +++ llvm/include/llvm/CodeGen/Passes.h @@ -171,6 +171,9 @@ /// This pass adds flow sensitive discriminators. extern char &MIRAddFSDiscriminatorsID; + /// This pass reads flow sensitive profile. + extern char &MIRProfileLoaderPassID; + /// FastRegisterAllocation Pass - This pass register allocates as fast as /// possible. It is best suited for debug code where live ranges are short. /// @@ -513,6 +516,11 @@ FunctionPass * createMIRAddFSDiscriminatorsPass(sampleprof::FSDiscriminatorPass P); + /// Read Flow Sensitive Profile. + FunctionPass *createMIRProfileLoaderPass(std::string File, + std::string RemappingFile, + sampleprof::FSDiscriminatorPass P); + /// Creates MIR Debugify pass. 
\see MachineDebugify.cpp ModulePass *createDebugifyMachineModulePass(); Index: llvm/include/llvm/IR/DebugInfoMetadata.h =================================================================== --- llvm/include/llvm/IR/DebugInfoMetadata.h +++ llvm/include/llvm/IR/DebugInfoMetadata.h @@ -2212,7 +2212,8 @@ return getCopyIdentifierFromDiscriminator(getDiscriminator()); } -Optional DILocation::cloneWithBaseDiscriminator(unsigned D) const { +Optional +DILocation::cloneWithBaseDiscriminator(unsigned D) const { unsigned BD, DF, CI; if (EnableFSDiscriminator) { @@ -2230,7 +2231,8 @@ return None; } -Optional DILocation::cloneByMultiplyingDuplicationFactor(unsigned DF) const { +Optional +DILocation::cloneByMultiplyingDuplicationFactor(unsigned DF) const { assert(!EnableFSDiscriminator && "FSDiscriminator should not call this."); DF *= getDuplicationFactor(); Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -64,6 +64,7 @@ void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&); +void initializeAddFSDiscriminatorsPass(PassRegistry &); void initializeModuleAddressSanitizerLegacyPassPass(PassRegistry &); void initializeASanGlobalsMetadataWrapperPassPass(PassRegistry &); void initializeAddressSanitizerLegacyPassPass(PassRegistry &); @@ -183,6 +184,7 @@ void initializeGlobalsAAWrapperPassPass(PassRegistry&); void initializeGuardWideningLegacyPassPass(PassRegistry&); void initializeHardwareLoopsPass(PassRegistry&); +void initializeMIRProfileLoaderPassPass(PassRegistry &); void initializeMemProfilerLegacyPassPass(PassRegistry &); void initializeHotColdSplittingLegacyPassPass(PassRegistry&); void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &); Index: llvm/include/llvm/Passes/PassBuilder.h 
=================================================================== --- llvm/include/llvm/Passes/PassBuilder.h +++ llvm/include/llvm/Passes/PassBuilder.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/IR/PassManager.h" #include "llvm/Passes/OptimizationLevel.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO/Inliner.h" @@ -32,6 +33,9 @@ class TargetMachine; class ModuleSummaryIndex; +extern cl::opt FSProfileFile; +extern cl::opt FSRemappingFile; + /// A struct capturing PGO tunables. struct PGOOptions { enum PGOAction { NoAction, IRInstr, IRUse, SampleUse }; @@ -65,6 +69,12 @@ // PseudoProbeForProfiling needs to be true. assert(this->Action != NoAction || this->CSAction != NoCSAction || this->DebugInfoForProfiling || this->PseudoProbeForProfiling); + + // Set FSProfileFile and FSRemappingFile for SampleUse action. + if (this->Action == SampleUse) { + FSProfileFile.setValue(ProfileFile); + FSRemappingFile.setValue(ProfileRemappingFile); + } } std::string ProfileFile; std::string CSProfileGenFile; @@ -710,6 +720,6 @@ return false; } -} +} // namespace llvm #endif Index: llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h =================================================================== --- llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h +++ llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h @@ -56,15 +56,20 @@ using FunctionT = Function; using BlockFrequencyInfoT = BlockFrequencyInfo; using LoopT = Loop; - using LoopInfoT = LoopInfo; + using LoopInfoPtrT = std::unique_ptr; + using DominatorTreePtrT = std::unique_ptr; + using PostDominatorTreeT = PostDominatorTree; + using PostDominatorTreePtrT = std::unique_ptr; using OptRemarkEmitterT = OptimizationRemarkEmitter; using OptRemarkAnalysisT = OptimizationRemarkAnalysis; - using DominatorTreeT = DominatorTree; - using PostDominatorTreeT = 
PostDominatorTree; + using PredRangeT = pred_range; + using SuccRangeT = succ_range; static Function &getFunction(Function &F) { return F; } static const BasicBlock *getEntryBB(const Function *F) { return &F->getEntryBlock(); } + static pred_range getPredecessors(BasicBlock *BB) { return predecessors(BB); } + static succ_range getSuccessors(BasicBlock *BB) { return successors(BB); } }; } // end namespace afdo_detail @@ -76,7 +81,8 @@ template class SampleProfileLoaderBaseImpl { public: - SampleProfileLoaderBaseImpl(std::string Name) : Filename(Name) {} + SampleProfileLoaderBaseImpl(std::string Name, std::string RemapName) + : Filename(Name), RemappingFilename(RemapName) {} void dump() { Reader->dump(); } using InstructionT = typename afdo_detail::IRTraits::InstructionT; @@ -85,14 +91,19 @@ typename afdo_detail::IRTraits::BlockFrequencyInfoT; using FunctionT = typename afdo_detail::IRTraits::FunctionT; using LoopT = typename afdo_detail::IRTraits::LoopT; - using LoopInfoT = typename afdo_detail::IRTraits::LoopInfoT; + using LoopInfoPtrT = typename afdo_detail::IRTraits::LoopInfoPtrT; + using DominatorTreePtrT = + typename afdo_detail::IRTraits::DominatorTreePtrT; + using PostDominatorTreePtrT = + typename afdo_detail::IRTraits::PostDominatorTreePtrT; + using PostDominatorTreeT = + typename afdo_detail::IRTraits::PostDominatorTreeT; using OptRemarkEmitterT = typename afdo_detail::IRTraits::OptRemarkEmitterT; using OptRemarkAnalysisT = typename afdo_detail::IRTraits::OptRemarkAnalysisT; - using DominatorTreeT = typename afdo_detail::IRTraits::DominatorTreeT; - using PostDominatorTreeT = - typename afdo_detail::IRTraits::PostDominatorTreeT; + using PredRangeT = typename afdo_detail::IRTraits::PredRangeT; + using SuccRangeT = typename afdo_detail::IRTraits::SuccRangeT; using BlockWeightMap = DenseMap; using EquivalenceClassMap = @@ -112,6 +123,12 @@ const BasicBlockT *getEntryBB(const FunctionT *F) { return afdo_detail::IRTraits::getEntryBB(F); } + PredRangeT 
getPredecessors(BasicBlockT *BB) { + return afdo_detail::IRTraits::getPredecessors(BB); + } + SuccRangeT getSuccessors(BasicBlockT *BB) { + return afdo_detail::IRTraits::getSuccessors(BB); + } unsigned getFunctionLoc(FunctionT &Func); virtual ErrorOr getInstWeight(const InstructionT &Inst); @@ -129,12 +146,11 @@ void findEquivalencesFor(BasicBlockT *BB1, ArrayRef Descendants, PostDominatorTreeT *DomTree); - void propagateWeights(FunctionT &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(FunctionT &F); bool propagateThroughEdges(FunctionT &F, bool UpdateBlockCount); - void clearFunctionData(); + void clearFunctionData(bool ResetDT = true); void computeDominanceAndLoopInfo(FunctionT &F); bool computeAndPropagateWeights(FunctionT &F, @@ -168,9 +184,9 @@ EquivalenceClassMap EquivalenceClass; /// Dominance, post-dominance and loop information. - std::unique_ptr DT; - std::unique_ptr PDT; - std::unique_ptr LI; + DominatorTreePtrT DT; + PostDominatorTreePtrT PDT; + LoopInfoPtrT LI; /// Predecessors for each basic block in the CFG. BlockEdgeMap Predecessors; @@ -190,6 +206,9 @@ /// Name of the profile file to load. std::string Filename; + /// Name of the profile remapping file to load. + std::string RemappingFilename; + /// Profile Summary Info computed from sample profile. ProfileSummaryInfo *PSI = nullptr; @@ -199,15 +218,17 @@ /// Clear all the per-function data used to load samples and propagate weights. template -void SampleProfileLoaderBaseImpl::clearFunctionData() { +void SampleProfileLoaderBaseImpl::clearFunctionData(bool ResetDT) { BlockWeights.clear(); EdgeWeights.clear(); VisitedBlocks.clear(); VisitedEdges.clear(); EquivalenceClass.clear(); - DT = nullptr; - PDT = nullptr; - LI = nullptr; + if (ResetDT) { + DT = nullptr; + PDT = nullptr; + LI = nullptr; + } Predecessors.clear(); Successors.clear(); CoverageTracker.clear(); @@ -475,7 +496,7 @@ // class by making BB2's equivalence class be BB1. 
DominatedBBs.clear(); DT->getDescendants(BB1, DominatedBBs); - findEquivalencesFor(BB1, DominatedBBs, PDT.get()); + findEquivalencesFor(BB1, DominatedBBs, &*PDT); LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1)); } @@ -692,7 +713,7 @@ SmallPtrSet Visited; if (!Predecessors[B1].empty()) llvm_unreachable("Found a stale predecessors list in a basic block."); - for (BasicBlockT *B2 : predecessors(B1)) + for (auto *B2 : getPredecessors(B1)) if (Visited.insert(B2).second) Predecessors[B1].push_back(B2); @@ -700,7 +721,7 @@ Visited.clear(); if (!Successors[B1].empty()) llvm_unreachable("Found a stale successors list in a basic block."); - for (BasicBlockT *B2 : successors(B1)) + for (auto *B2 : getSuccessors(B1)) if (Visited.insert(B2).second) Successors[B1].push_back(B2); } @@ -911,12 +932,12 @@ template void SampleProfileLoaderBaseImpl::computeDominanceAndLoopInfo( FunctionT &F) { - DT.reset(new DominatorTreeT); + DT.reset(new DominatorTree); DT->recalculate(F); PDT.reset(new PostDominatorTree(F)); - LI.reset(new LoopInfoT); + LI.reset(new LoopInfo); LI->analyze(*DT); } Index: llvm/lib/CodeGen/CMakeLists.txt =================================================================== --- llvm/lib/CodeGen/CMakeLists.txt +++ llvm/lib/CodeGen/CMakeLists.txt @@ -108,6 +108,7 @@ MachineTraceMetrics.cpp MachineVerifier.cpp MIRFSDiscriminator.cpp + MIRSampleProfile.cpp MIRYamlMapping.cpp ModuloSchedule.cpp MultiHazardRecognizer.cpp Index: llvm/lib/CodeGen/MIRSampleProfile.cpp =================================================================== --- /dev/null +++ llvm/lib/CodeGen/MIRSampleProfile.cpp @@ -0,0 +1,339 @@ +//===-------- MIRSampleProfile.cpp: MIRSampleFDO (For FSAFDO) -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides the implementation of the MIRSampleProfile loader, mainly +// for flow sensitive SampleFDO. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MIRSampleProfile.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h" +#include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h" + +using namespace llvm; +using namespace sampleprof; +using namespace llvm::sampleprofutil; +using ProfileCount = Function::ProfileCount; + +#define DEBUG_TYPE "fs-profile-loader" + +static cl::opt EnableFSBranchProb( + "enable-fs-branchprob", cl::Hidden, cl::init(true), + cl::desc("Enable seting flow sensitive branch probabilities")); +static cl::opt ShowFSBranchProb( + "show-fs-branchprob", cl::Hidden, cl::init(false), + cl::desc("Print setting flow sensitive branch probabilities")); +static cl::opt FSProfileDebugProbDiffThreshold( + "fs-profile-debug-prob-diff-threshold", cl::init(10), + cl::desc("Only show debug message if the branch probility is greater than " + "this value (in percentage).")); + +static cl::opt FSProfileDebugBWThreshold( + "fs-profile-debug-bw-threshold", cl::init(10000), + cl::desc("Only show debug message if the source branch weight is greater " + " than this value.")); + +static cl::opt ViewBFIBefore("fs-viewbfi-before", cl::Hidden, + cl::init(false), + cl::desc("View BFI before MIR loader")); +static cl::opt ViewBFIAfter("fs-viewbfi-after", cl::Hidden, + cl::init(false), + cl::desc("View BFI after MIR loader")); + +char MIRProfileLoaderPass::ID = 0; + 
+INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE, + "Load MIR Sample Profile", + /* cfg = */ false, /* is_analysis = */ false) +INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) +INITIALIZE_PASS_END(MIRProfileLoaderPass, DEBUG_TYPE, "Load MIR Sample Profile", + /* cfg = */ false, /* is_analysis = */ false) + +char &llvm::MIRProfileLoaderPassID = MIRProfileLoaderPass::ID; + +FunctionPass *llvm::createMIRProfileLoaderPass(std::string File, + std::string RemappingFile, + FSDiscriminatorPass P) { + return new MIRProfileLoaderPass(File, RemappingFile, P); +} + +namespace llvm { + +// Internal option used to control BFI display only after MBP pass. +// Defined in CodeGen/MachineBlockFrequencyInfo.cpp: +// -view-block-layout-with-bfi={none | fraction | integer | count} +extern cl::opt ViewBlockLayoutWithBFI; + +// Command line option to specify the name of the function for CFG dump +// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= +extern cl::opt ViewBlockFreqFuncName; + +namespace afdo_detail { +template <> struct IRTraits { + using InstructionT = MachineInstr; + using BasicBlockT = MachineBasicBlock; + using FunctionT = MachineFunction; + using BlockFrequencyInfoT = MachineBlockFrequencyInfo; + using LoopT = MachineLoop; + using LoopInfoPtrT = MachineLoopInfo *; + using DominatorTreePtrT = MachineDominatorTree *; + using PostDominatorTreePtrT = MachinePostDominatorTree *; + using PostDominatorTreeT = MachinePostDominatorTree; + using OptRemarkEmitterT = MachineOptimizationRemarkEmitter; + using OptRemarkAnalysisT = MachineOptimizationRemarkAnalysis; + using PredRangeT = iterator_range::iterator>; + using SuccRangeT = iterator_range::iterator>; + static Function &getFunction(MachineFunction &F) { return F.getFunction(); } + static 
const MachineBasicBlock *getEntryBB(const MachineFunction *F) { + return GraphTraits::getEntryNode(F); + } + static PredRangeT getPredecessors(MachineBasicBlock *BB) { + return BB->predecessors(); + } + static SuccRangeT getSuccessors(MachineBasicBlock *BB) { + return BB->successors(); + } +}; +} // namespace afdo_detail + +class MIRProfileLoader final + : public SampleProfileLoaderBaseImpl { +public: + void setInitVals(MachineDominatorTree *MDT_, MachinePostDominatorTree *MPDT_, + MachineLoopInfo *MLI_, MachineBlockFrequencyInfo *MBFI_, + MachineOptimizationRemarkEmitter *ORE_) { + DT = MDT_; + PDT = MPDT_; + LI = MLI_; + MBFI = MBFI_; + ORE = ORE_; + } + void setFSPass(FSDiscriminatorPass Pass) { + P = Pass; + LowBit = getFSPassBitBegin(P); + HighBit = getFSPassBitEnd(P); + assert(LowBit < HighBit && "HighBit needs to be greater than Lowbit"); + } + + MIRProfileLoader(StringRef Name, StringRef RemapName) + : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)) { + } + + void setBranchProbs(MachineFunction &F); + bool runOnFunction(MachineFunction &F); + bool doInitialization(Module &M); + bool isValid() const { return ProfileIsValid; } + +protected: + friend class SampleCoverageTracker; + + /// Hold the information of the basic block frequency. + MachineBlockFrequencyInfo *MBFI; + + /// P is the sequence number of this pass, starting from 1. + FSDiscriminatorPass P; + + // LowBit in the FS discriminator used by this instance. Note the number is + // 0-based. The base discriminator uses bit 0 to bit 11. + unsigned LowBit; + // HighBit in the FS discriminator used by this instance. Note the number + // is 0-based. + unsigned HighBit; + + bool ProfileIsValid = true; +}; + +template <> +void SampleProfileLoaderBaseImpl< + MachineBasicBlock>::computeDominanceAndLoopInfo(MachineFunction &F) {} + +void MIRProfileLoader::setBranchProbs(MachineFunction &F) { + LLVM_DEBUG(dbgs() << "\nPropagation complete. 
Setting branch probs\n"); + for (auto &BI : F) { + MachineBasicBlock *BB = &BI; + if (BB->succ_size() < 2) + continue; + const MachineBasicBlock *EC = EquivalenceClass[BB]; + uint64_t BBWeight = BlockWeights[EC]; + uint64_t SumEdgeWeight = 0; + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + SE = BB->succ_end(); + SI != SE; ++SI) { + MachineBasicBlock *Succ = *SI; + Edge E = std::make_pair(BB, Succ); + SumEdgeWeight += EdgeWeights[E]; + } + + if (BBWeight != SumEdgeWeight) { + LLVM_DEBUG(dbgs() << "BBweight is not equal to SumEdgeWeight: BBWWeight=" + << BBWeight << " SumEdgeWeight= " << SumEdgeWeight + << "\n"); + BBWeight = SumEdgeWeight; + } + if (BBWeight == 0) { + LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n"); + continue; + } + +#ifndef NDEBUG + uint64_t BBWeightOrig = BBWeight; +#endif + uint32_t MaxWeight = std::numeric_limits::max(); + uint32_t Factor = 1; + if (BBWeight > MaxWeight) { + Factor = BBWeight / MaxWeight + 1; + BBWeight /= Factor; + LLVM_DEBUG(dbgs() << "Scaling weights by " << Factor << "\n"); + } + + for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), + SE = BB->succ_end(); + SI != SE; ++SI) { + MachineBasicBlock *Succ = *SI; + Edge E = std::make_pair(BB, Succ); + uint64_t EdgeWeight = EdgeWeights[E]; + EdgeWeight /= Factor; + + assert(BBWeight >= EdgeWeight && + "BBweight is larger than EdgeWeight -- should not happen.\n"); + + BranchProbability OldProb = MBFI->getMBPI()->getEdgeProbability(BB, SI); + BranchProbability NewProb(EdgeWeight, BBWeight); + if (OldProb == NewProb) + continue; + BB->setSuccProbability(SI, NewProb); +#ifndef NDEBUG + if (!ShowFSBranchProb) + continue; + bool Show = false; + BranchProbability Diff; + if (OldProb > NewProb) + Diff = OldProb - NewProb; + else + Diff = NewProb - OldProb; + Show = (Diff >= BranchProbability(FSProfileDebugProbDiffThreshold, 100)); + Show &= (BBWeightOrig >= FSProfileDebugBWThreshold); + + auto DIL = BB->findBranchDebugLoc(); + auto SuccDIL = 
Succ->findBranchDebugLoc(); + if (Show) { + dbgs() << "Set branch fs prob: MBB (" << BB->getNumber() << " -> " + << Succ->getNumber() << "): "; + if (DIL) + dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":" + << DIL->getColumn(); + if (SuccDIL) + dbgs() << "-->" << SuccDIL->getFilename() << ":" << SuccDIL->getLine() + << ":" << SuccDIL->getColumn(); + dbgs() << " W=" << BBWeightOrig << " " << OldProb << " --> " << NewProb + << "\n"; + } +#endif + } + } +} + +bool MIRProfileLoader::doInitialization(Module &M) { + auto &Ctx = M.getContext(); + + auto ReaderOrErr = sampleprof::SampleProfileReader::create(Filename, Ctx, P, + RemappingFilename); + if (std::error_code EC = ReaderOrErr.getError()) { + std::string Msg = "Could not open profile: " + EC.message(); + Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); + return false; + } + + Reader = std::move(ReaderOrErr.get()); + Reader->setModule(&M); + ProfileIsValid = (Reader->read() == sampleprof_error::success); + Reader->getSummary(); + + return true; +} + +bool MIRProfileLoader::runOnFunction(MachineFunction &MF) { + Function &Func = MF.getFunction(); + clearFunctionData(false); + Samples = Reader->getSamplesFor(Func); + if (!Samples || Samples->empty()) + return false; + + if (getFunctionLoc(MF) == 0) + return false; + + DenseSet InlinedGUIDs; + bool Changed = computeAndPropagateWeights(MF, InlinedGUIDs); + + // Set the new BPI, BFI. 
+ if (EnableFSBranchProb) + setBranchProbs(MF); + + return Changed; +} + +} // namespace llvm + +bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) { + if (!MIRSampleLoader->isValid()) + return false; + + LLVM_DEBUG(dbgs() << "MIRProfileLoader pass working on Func: " + << MF.getFunction().getName() << "\n"); + MBFI = &getAnalysis(); + MIRSampleLoader->setInitVals( + &getAnalysis(), + &getAnalysis(), &getAnalysis(), + MBFI, &getAnalysis().getORE()); + + MF.RenumberBlocks(); + if (ViewBFIBefore && ViewBlockLayoutWithBFI != GVDT_None && + (ViewBlockFreqFuncName.empty() || + MF.getFunction().getName().equals(ViewBlockFreqFuncName))) { + MBFI->view("MIR_Prof_loader_b." + MF.getName(), false); + } + + bool Changed = MIRSampleLoader->runOnFunction(MF); + + if (ViewBFIAfter && ViewBlockLayoutWithBFI != GVDT_None && + (ViewBlockFreqFuncName.empty() || + MF.getFunction().getName().equals(ViewBlockFreqFuncName))) { + MBFI->view("MIR_prof_loader_a." + MF.getName(), false); + } + + return Changed; +} + +bool MIRProfileLoaderPass::doInitialization(Module &M) { + LLVM_DEBUG(dbgs() << "MIRProfileLoader pass working on Module " << M.getName() + << "\n"); + + MIRSampleLoader->setFSPass(P); + return MIRSampleLoader->doInitialization(M); +} + +void MIRProfileLoaderPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequiredTransitive(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); +} Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -172,6 +172,16 @@ FSNoFinalDiscrim("fs-no-final-discrim", cl::init(false), cl::Hidden, cl::desc("Do not insert FS-AFDO discriminators before " "emit.")); +// Disable MIRProfileLoader before RegAlloc. This is for debugging and +// tuning purposes. 
+static cl::opt DisableRAFSProfileLoader( + "disable-ra-fsprofile-loader", cl::init(false), cl::Hidden, + cl::desc("Disable MIRProfileLoader before RegAlloc")); +// Disable MIRProfileLoader before BlockPlacement. This is for debugging +// and tuning purposes. +static cl::opt DisableLayoutFSProfileLoader( + "disable-layout-fsprofile-loader", cl::init(false), cl::Hidden, + cl::desc("Disable MIRProfileLoader before BlockPlacement")); // Temporary option to allow experimenting with MachineScheduler as a post-RA // scheduler. Targets can "properly" enable this with @@ -343,6 +353,8 @@ namespace llvm { extern cl::opt EnableFSDiscriminator; +extern cl::opt FSProfileFile; +extern cl::opt FSRemappingFile; class PassConfigImpl { public: @@ -1115,9 +1127,14 @@ // Add a FSDiscriminator pass right before RA, so that we could get // more precise SampleFDO profile for RA. - if (EnableFSDiscriminator) + if (EnableFSDiscriminator) { addPass(createMIRAddFSDiscriminatorsPass( sampleprof::FSDiscriminatorPass::Pass1)); + if (!FSProfileFile.empty() && !DisableRAFSProfileLoader) + addPass(createMIRProfileLoaderPass( + FSProfileFile.getValue(), FSRemappingFile.getValue(), + sampleprof::FSDiscriminatorPass::Pass1)); + } // Run register allocation and passes that are tightly coupled with it, // including phi elimination and scheduling. @@ -1471,9 +1488,14 @@ /// Add standard basic block placement passes. void TargetPassConfig::addBlockPlacement() { - if (EnableFSDiscriminator) + if (EnableFSDiscriminator) { addPass(createMIRAddFSDiscriminatorsPass( sampleprof::FSDiscriminatorPass::Pass2)); + if (!FSProfileFile.empty() && !DisableLayoutFSProfileLoader) + addPass(createMIRProfileLoaderPass( + FSProfileFile.getValue(), FSRemappingFile.getValue(), + sampleprof::FSDiscriminatorPass::Pass2)); + } if (addPass(&MachineBlockPlacementID)) { // Run a separate pass to collect block placement statistics. 
if (EnableBlockPlacementStats) Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -1094,6 +1094,7 @@ // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured. if (LoadSampleProfile) EarlyFPM.addPass(InstCombinePass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); if (LoadSampleProfile) { @@ -1107,13 +1108,14 @@ // Do not invoke ICP in the LTOPrelink phase as it makes it hard // for the profile annotation to be accurate in the LTO backend. if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink && - Phase != ThinOrFullLTOPhase::FullLTOPreLink) + Phase != ThinOrFullLTOPhase::FullLTOPreLink) { // We perform early indirect call promotion here, before globalopt. // This is important for the ThinLTO backend phase because otherwise // imported available_externally functions look unreferenced and are // removed. MPM.addPass( PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); + } } // Try to perform OpenMP specific optimizations on the module. This is a Index: llvm/lib/ProfileData/SampleProf.cpp =================================================================== --- llvm/lib/ProfileData/SampleProf.cpp +++ llvm/lib/ProfileData/SampleProf.cpp @@ -36,6 +36,14 @@ "will be used. 
This is very useful for performance debugging")); namespace llvm { + +cl::opt + FSProfileFile("fs-profile-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Flow Sensitive profile file name."), cl::Hidden); +cl::opt FSRemappingFile( + "fs-remapping-file", cl::init(""), cl::value_desc("filename"), + cl::desc("Flow Sensitive profile remapping file name."), cl::Hidden); + namespace sampleprof { SampleProfileFormat FunctionSamples::Format; bool FunctionSamples::ProfileIsProbeBased = false; Index: llvm/lib/Transforms/IPO/SampleProfile.cpp =================================================================== --- llvm/lib/Transforms/IPO/SampleProfile.cpp +++ llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -358,16 +358,25 @@ std::function GetAssumptionCache, std::function GetTargetTransformInfo, std::function GetTLI) - : SampleProfileLoaderBaseImpl(std::string(Name)), + : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName)), GetAC(std::move(GetAssumptionCache)), GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)), - RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {} + LTOPhase(LTOPhase) {} bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); bool runOnModule(Module &M, ModuleAnalysisManager *AM, ProfileSummaryInfo *_PSI, CallGraph *CG); protected: +#if 0 + PredRangeT getPredecessors(BasicBlockT *BB) { + return predecessors(BB); + } + SuccRangeT getSuccessors(BasicBlockT *BB) { + return successors(BB); + } +#endif + bool runOnFunction(Function &F, ModuleAnalysisManager *AM); bool emitAnnotations(Function &F); ErrorOr getInstWeight(const Instruction &I) override; @@ -417,9 +426,6 @@ /// Profile tracker for different context. std::unique_ptr ContextTracker; - /// Name of the profile remapping file to load. 
- std::string RemappingFilename; - /// Flag indicating whether input profile is context-sensitive bool ProfileIsCS = false; Index: llvm/test/CodeGen/X86/Inputs/fsloader.afdo =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/Inputs/fsloader.afdo @@ -0,0 +1,35 @@ +work:42380966:1346190 + 1: 1246499 + 5: 1246499 +foo:28798256:4267 + 0: 4267 + 2.1: 255999 + 4: 264627 bar:250018 + 4.512: 269485 bar:278102 + 4.4608: 280297 bar:280933 + 4.12288: 278916 bar:267752 + 5: 264627 + 5.4096: 269485 + 5.8192: 260670 + 5.8704: 278916 + 6: 11541 + 6.3584: 278916 work:284547 + 6.4096: 260670 work:249428 + 6.8704: 11541 + 7: 272442 + 7.512: 283590 + 7.4608: 234082 + 7.9728: 279149 + 8: 11541 + 8.11776: 283590 work:305061 + 8.12288: 279149 work:281368 + 8.13824: 234082 work:225786 + 10: 4050 +bar:9504180:1076805 + 2: 1056020 + 3: 1056020 +main:20360:0 + 0: 0 + 2.1: 4045 + 3: 4156 foo:4267 + 5: 0 Index: llvm/test/CodeGen/X86/fsafdo_test2.ll =================================================================== --- llvm/test/CodeGen/X86/fsafdo_test2.ll +++ llvm/test/CodeGen/X86/fsafdo_test2.ll @@ -1,4 +1,7 @@ ; RUN: llc -enable-fs-discriminator < %s | FileCheck %s +; RUN: llvm-profdata merge --sample -profile-isfs -o %t.afdo %S/Inputs/fsloader.afdo +; RUN: llc -enable-fs-discriminator -fs-profile-file=%t.afdo -show-fs-branchprob < %s 2>&1 | FileCheck %s --check-prefix=LOADER +; ;; ;; C source code for the test (compiler at -O3): ;; // A test case for loop unroll. @@ -50,6 +53,25 @@ ; CHECK: .byte 1 ; CHECK: .size __llvm_fs_discriminator__, 1 +;; Check that new branch probs are generated. 
+; LOADER: Set branch fs prob: MBB (1 -> 3): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x7aca7894 / 0x80000000 = 95.93% +; LOADER: Set branch fs prob: MBB (1 -> 2): unroll.c:22:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x0535876c / 0x80000000 = 4.07% +; LOADER: Set branch fs prob: MBB (3 -> 5): unroll.c:24:11-->unroll.c:22:11 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x7aca7894 / 0x80000000 = 95.93% +; LOADER: Set branch fs prob: MBB (3 -> 4): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x0535876c / 0x80000000 = 4.07% +; LOADER: Set branch fs prob: MBB (5 -> 8): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x021c112e / 0x80000000 = 1.65% +; LOADER: Set branch fs prob: MBB (5 -> 7): unroll.c:22:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x7de3eed2 / 0x80000000 = 98.35% +; LOADER: Set branch fs prob: MBB (8 -> 10): unroll.c:24:11-->unroll.c:22:11 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x00000000 / 0x80000000 = 0.00% +; LOADER: Set branch fs prob: MBB (8 -> 9): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x80000000 / 0x80000000 = 100.00% +; LOADER: Set branch fs prob: MBB (10 -> 12): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x7aca7894 / 0x80000000 = 95.93% +; LOADER: Set branch fs prob: MBB (10 -> 11): unroll.c:22:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x0535876c / 0x80000000 = 4.07% +; LOADER: Set branch fs prob: MBB (12 -> 14): unroll.c:24:11-->unroll.c:22:11 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x02012507 / 0x80000000 = 1.57% +; LOADER: Set branch fs prob: MBB (12 -> 13): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x7dfedaf9 / 0x80000000 = 98.43% +; LOADER: Set branch fs prob: MBB (14 -> 16): unroll.c:22:11-->unroll.c:24:11 W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x0a5856e1 / 0x80000000 = 8.08% +; LOADER: Set branch fs prob: MBB (14 -> 15): unroll.c:22:11 
W=283590 0x40000000 / 0x80000000 = 50.00% --> 0x75a7a91f / 0x80000000 = 91.92% +; LOADER: Set branch fs prob: MBB (16 -> 18): unroll.c:24:11-->unroll.c:19:3 W=283590 0x30000000 / 0x80000000 = 37.50% --> 0x16588166 / 0x80000000 = 17.46% +; LOADER: Set branch fs prob: MBB (16 -> 17): unroll.c:24:11 W=283590 0x50000000 / 0x80000000 = 62.50% --> 0x69a77e9a / 0x80000000 = 82.54% + + target triple = "x86_64-unknown-linux-gnu" @sum = dso_local local_unnamed_addr global i32 0, align 4