diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -377,6 +377,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); } }; @@ -405,6 +406,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); } }; @@ -437,6 +439,7 @@ "PGO instrumentation.", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen", "PGO instrumentation.", false, false) @@ -566,11 +569,11 @@ } FuncPGOInstrumentation( - Function &Func, + Function &Func, TargetLibraryInfo &TLI, std::unordered_multimap &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI), ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. SIVisitor.countSelects(Func); @@ -834,15 +837,16 @@ // Visit all edge and instrument the edges not in MST, and do value profiling. // Critical edges will be split. static void instrumentOneFunc( - Function &F, Module *M, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI, + Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI, + BlockFrequencyInfo *BFI, std::unordered_multimap &ComdatMembers, bool IsCS) { // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. SplitIndirectBrCriticalEdges(F, BPI, BFI); - FuncPGOInstrumentation FuncInfo(F, ComdatMembers, true, BPI, - BFI, IsCS); + FuncPGOInstrumentation FuncInfo(F, TLI, ComdatMembers, true, + BPI, BFI, IsCS); std::vector InstrumentBBs; FuncInfo.getInstrumentBBs(InstrumentBBs); unsigned NumCounters = @@ -997,12 +1001,12 @@ class PGOUseFunc { public: - PGOUseFunc(Function &Func, Module *Modu, + PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI, std::unordered_multimap &ComdatMembers, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin, ProfileSummaryInfo *PSI, bool IsCS) : F(Func), M(Modu), BFI(BFIin), PSI(PSI), - FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS), + FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS), FreqAttr(FFA_Normal), IsCS(IsCS) {} // Read counts for the instrumented BB from profile. @@ -1504,7 +1508,8 @@ } static bool InstrumentAllFunctions( - Module &M, function_ref LookupBPI, + Module &M, function_ref LookupTLI, + function_ref LookupBPI, function_ref LookupBFI, bool IsCS) { // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. @@ -1516,9 +1521,10 @@ for (auto &F : M) { if (F.isDeclaration()) continue; + auto &TLI = LookupTLI(F); auto *BPI = LookupBPI(F); auto *BFI = LookupBFI(F); - instrumentOneFunc(F, &M, BPI, BFI, ComdatMembers, IsCS); + instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS); } return true; } @@ -1534,27 +1540,32 @@ if (skipModule(M)) return false; + auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; auto LookupBPI = [this](Function &F) { return &this->getAnalysis(F).getBPI(); }; auto LookupBFI = [this](Function &F) { return &this->getAnalysis(F).getBFI(); }; - return InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS); + return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS); } PreservedAnalyses PGOInstrumentationGen::run(Module &M, ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); + auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; auto LookupBPI = [&FAM](Function &F) { return &FAM.getResult(F); }; - auto LookupBFI = [&FAM](Function &F) { return &FAM.getResult(F); }; - if (!InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS)) + if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1562,6 +1573,7 @@ static bool annotateAllFunctions( Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName, + function_ref LookupTLI, function_ref LookupBPI, function_ref LookupBFI, ProfileSummaryInfo *PSI, bool IsCS) { @@ -1609,12 +1621,13 @@ for (auto &F : M) { if (F.isDeclaration()) continue; + auto &TLI = LookupTLI(F); auto *BPI = LookupBPI(F); auto *BFI = LookupBFI(F); // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. SplitIndirectBrCriticalEdges(F, BPI, BFI); - PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS); + PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS); bool AllZeros = false; if (!Func.readCounters(PGOReader.get(), AllZeros)) continue; @@ -1695,10 +1708,12 @@ ModuleAnalysisManager &AM) { auto &FAM = AM.getResult(M).getManager(); + auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult(F); + }; auto LookupBPI = [&FAM](Function &F) { return &FAM.getResult(F); }; - auto LookupBFI = [&FAM](Function &F) { return &FAM.getResult(F); }; @@ -1706,7 +1721,7 @@ auto *PSI = &AM.getResult(M); if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, - LookupBPI, LookupBFI, PSI, IsCS)) + LookupTLI, LookupBPI, LookupBFI, PSI, IsCS)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1716,6 +1731,9 @@ if (skipModule(M)) return false; + auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis().getTLI(F); + }; auto LookupBPI = [this](Function &F) { return &this->getAnalysis(F).getBPI(); }; @@ -1724,8 +1742,8 @@ }; auto *PSI = &getAnalysis().getPSI(); - return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, PSI, - IsCS); + return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI, + LookupBFI, PSI, IsCS); } static std::string getSimpleNodeName(const BasicBlock *Node) { diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -95,6 +95,11 @@ // This option sets the value that groups large memop sizes extern cl::opt MemOPSizeLarge; +static cl::opt + MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true), + cl::Hidden, + cl::desc("Size-specialize memcmp and bcmp calls")); + namespace { class PGOMemOPSizeOptLegacyPass : public FunctionPass { public: @@ -113,6 +118,7 @@ AU.addRequired(); AU.addPreserved(); AU.addPreserved(); + AU.addRequired(); } }; } // end anonymous namespace @@ -122,6 +128,7 @@ "Optimize memory intrinsic using its size value profile", false, false) INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt", "Optimize memory intrinsic using its size value profile", false, false) @@ -131,11 +138,90 @@ } namespace { + +static const char *getMIName(const MemIntrinsic *MI) { + switch (MI->getIntrinsicID()) { + case Intrinsic::memcpy: + return "memcpy"; + case Intrinsic::memmove: + return "memmove"; + case Intrinsic::memset: + return "memset"; + default: + return "unknown"; + } +} + +// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp). +struct MemOp { + Instruction *I; + MemOp(MemIntrinsic *MI) : I(MI) {} + MemOp(CallInst *CI) : I(CI) {} + MemIntrinsic *asMI() { return dyn_cast(I); } + CallInst *asCI() { return cast(I); } + MemOp clone() { + if (auto MI = asMI()) + return MemOp(cast(MI->clone())); + return MemOp(cast(asCI()->clone())); + } + Value *getLength() { + if (auto MI = asMI()) + return MI->getLength(); + return asCI()->getArgOperand(2); + } + void setLength(Value *Length) { + if (auto MI = asMI()) + return MI->setLength(Length); + asCI()->setArgOperand(2, Length); + } + StringRef getFuncName() { + if (auto MI = asMI()) + return MI->getCalledFunction()->getName(); + return asCI()->getCalledFunction()->getName(); + } + bool isMemmove() { + if (auto MI = asMI()) + if (MI->getIntrinsicID() == Intrinsic::memmove) + return true; + return false; + } + bool isMemcmp(TargetLibraryInfo &TLI) { + LibFunc Func; + if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && + Func == LibFunc_memcmp) { + return true; + } + return false; + } + bool isBcmp(TargetLibraryInfo &TLI) { + LibFunc Func; + if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) && + Func == LibFunc_bcmp) { + return true; + } + return false; + } + const char *getShortName(TargetLibraryInfo &TLI) { + if (auto MI = asMI()) + return getMIName(MI); + LibFunc Func; + if (TLI.getLibFunc(*asCI(), Func)) { + if (Func == LibFunc_memcmp) + return "memcmp"; + if (Func == LibFunc_bcmp) + return "bcmp"; + } + llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst"); + return nullptr; + } +}; + class MemOPSizeOpt : public InstVisitor { public: MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI, - OptimizationRemarkEmitter &ORE, DominatorTree *DT) - : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { + OptimizationRemarkEmitter &ORE, DominatorTree *DT, + TargetLibraryInfo &TLI) + : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) { ValueDataArray = std::make_unique(MemOPMaxVersion + 2); // Get the MemOPSize range information from option MemOPSizeRange, @@ -147,13 +233,12 @@ WorkList.clear(); visit(Func); - for (auto &MI : WorkList) { + for (auto &MO : WorkList) { ++NumOfPGOMemOPAnnotate; - if (perform(MI)) { + if (perform(MO)) { Changed = true; ++NumOfPGOMemOPOpt; - LLVM_DEBUG(dbgs() << "MemOP call: " - << MI->getCalledFunction()->getName() + LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName() << "is Transformed.\n"); } } @@ -164,7 +249,16 @@ // Not perform on constant length calls. if (dyn_cast(Length)) return; - WorkList.push_back(&MI); + WorkList.push_back(MemOp(&MI)); + } + + void visitCallInst(CallInst &CI) { + LibFunc Func; + if (TLI.getLibFunc(CI, Func) && + (Func == LibFunc_memcmp || Func == LibFunc_bcmp) && + !dyn_cast(CI.getArgOperand(2))) { + WorkList.push_back(MemOp(&CI)); + } } private: @@ -172,15 +266,16 @@ BlockFrequencyInfo &BFI; OptimizationRemarkEmitter &ORE; DominatorTree *DT; + TargetLibraryInfo &TLI; bool Changed; - std::vector WorkList; + std::vector WorkList; // Start of the previse range. int64_t PreciseRangeStart; // Last value of the previse range. int64_t PreciseRangeLast; // The space to read the profile annotation. std::unique_ptr ValueDataArray; - bool perform(MemIntrinsic *MI); + bool perform(MemOp MO); // This kind shows which group the value falls in. For PreciseValue, we have // the profile count for that value. LargeGroup groups the values that are in @@ -196,19 +291,6 @@ } }; -static const char *getMIName(const MemIntrinsic *MI) { - switch (MI->getIntrinsicID()) { - case Intrinsic::memcpy: - return "memcpy"; - case Intrinsic::memmove: - return "memmove"; - case Intrinsic::memset: - return "memset"; - default: - return "unknown"; - } -} - static bool isProfitable(uint64_t Count, uint64_t TotalCount) { assert(Count <= TotalCount); if (Count < MemOPCountThreshold) @@ -227,21 +309,23 @@ return ScaleCount / Denom; } -bool MemOPSizeOpt::perform(MemIntrinsic *MI) { - assert(MI); - if (MI->getIntrinsicID() == Intrinsic::memmove) +bool MemOPSizeOpt::perform(MemOp MO) { + assert(MO.I); + if (MO.isMemmove()) + return false; + if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI))) return false; uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2; uint64_t TotalCount; - if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions, + if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions, ValueDataArray.get(), NumVals, TotalCount)) return false; uint64_t ActualCount = TotalCount; uint64_t SavedTotalCount = TotalCount; if (MemOPScaleCount) { - auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent()); + auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent()); if (!BBEdgeCount) return false; ActualCount = *BBEdgeCount; @@ -333,13 +417,13 @@ // } // merge_bb: - BasicBlock *BB = MI->getParent(); + BasicBlock *BB = MO.I->getParent(); LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n"); LLVM_DEBUG(dbgs() << *BB << "\n"); auto OrigBBFreq = BFI.getBlockFreq(BB); - BasicBlock *DefaultBB = SplitBlock(BB, MI, DT); - BasicBlock::iterator It(*MI); + BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT); + BasicBlock::iterator It(*MO.I); ++It; assert(It != DefaultBB->end()); BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT); @@ -351,15 +435,24 @@ auto &Ctx = Func.getContext(); IRBuilder<> IRB(BB); BB->getTerminator()->eraseFromParent(); - Value *SizeVar = MI->getLength(); + Value *SizeVar = MO.getLength(); SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size()); + Type *MemOpTy = MO.I->getType(); + PHINode *PHI = nullptr; + if (!MemOpTy->isVoidTy()) { + // Insert a phi for the return values at the merge block. + IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); + PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); + MO.I->replaceAllUsesWith(PHI); + PHI->addIncoming(MO.I, DefaultBB); + } // Clear the value profile data. - MI->setMetadata(LLVMContext::MD_prof, nullptr); + MO.I->setMetadata(LLVMContext::MD_prof, nullptr); // If all promoted, we don't need the MD.prof metadata. if (SavedRemainCount > 0 || Version != NumVals) // Otherwise we need update with the un-promoted records back. - annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version), + annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version), SavedRemainCount, IPVK_MemOPSize, NumVals); LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n"); @@ -371,17 +464,18 @@ for (uint64_t SizeId : SizeIds) { BasicBlock *CaseBB = BasicBlock::Create( Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); - Instruction *NewInst = MI->clone(); + MemOp NewMO = MO.clone(); // Fix the argument. - auto *MemI = cast(NewInst); - auto *SizeType = dyn_cast(MemI->getLength()->getType()); + auto *SizeType = dyn_cast(NewMO.getLength()->getType()); assert(SizeType && "Expected integer type size argument."); ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); - MemI->setLength(CaseSizeId); - CaseBB->getInstList().push_back(NewInst); + NewMO.setLength(CaseSizeId); + CaseBB->getInstList().push_back(NewMO.I); IRBuilder<> IRBCase(CaseBB); IRBCase.CreateBr(MergeBB); SI->addCase(CaseSizeId, CaseBB); + if (!MemOpTy->isVoidTy()) + PHI->addIncoming(NewMO.I, CaseBB); if (DT) { Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB}); Updates.push_back({DominatorTree::Insert, BB, CaseBB}); @@ -399,11 +493,11 @@ ORE.emit([&]() { using namespace ore; - return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI) - << "optimized " << NV("Intrinsic", StringRef(getMIName(MI))) - << " with count " << NV("Count", SumForOpt) << " out of " - << NV("Total", TotalCount) << " for " << NV("Versions", Version) - << " versions"; + return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I) + << "optimized " << NV("Memop", MO.getShortName(TLI)) + << " with count " << NV("Count", SumForOpt) << " out of " + << NV("Total", TotalCount) << " for " << NV("Versions", Version) + << " versions"; }); return true; @@ -412,13 +506,13 @@ static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI, OptimizationRemarkEmitter &ORE, - DominatorTree *DT) { + DominatorTree *DT, TargetLibraryInfo &TLI) { if (DisableMemOPOPT) return false; if (F.hasFnAttribute(Attribute::OptimizeForSize)) return false; - MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT); + MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI); MemOPSizeOpt.perform(); return MemOPSizeOpt.isChanged(); } @@ -429,7 +523,9 @@ auto &ORE = getAnalysis().getORE(); auto *DTWP = getAnalysisIfAvailable(); DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr; - return PGOMemOPSizeOptImpl(F, BFI, ORE, DT); + TargetLibraryInfo &TLI = + getAnalysis().getTLI(F); + return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); } namespace llvm { @@ -440,7 +536,8 @@ auto &BFI = FAM.getResult(F); auto &ORE = FAM.getResult(F); auto *DT = FAM.getCachedResult(F); - bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT); + auto &TLI = FAM.getResult(F); + bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI); if (!Changed) return PreservedAnalyses::all(); auto PA = PreservedAnalyses(); diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h --- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h @@ -16,6 +16,7 @@ #ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H #define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -58,7 +59,7 @@ Instruction *AnnotatedInst; // Where metadata is attached. }; - ValueProfileCollector(Function &Fn); + ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI); ValueProfileCollector(ValueProfileCollector &&) = delete; ValueProfileCollector &operator=(ValueProfileCollector &&) = delete; diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp --- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp +++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -38,7 +38,7 @@ template <> class PluginChain<> { public: - PluginChain(Function &F) {} + PluginChain(Function &F, TargetLibraryInfo &TLI) {} void get(InstrProfValueKind K, std::vector &Candidates) {} }; @@ -48,7 +48,8 @@ using Base = PluginChain; public: - PluginChain(Function &F) : PluginChain(F), Plugin(F) {} + PluginChain(Function &F, TargetLibraryInfo &TLI) + : PluginChain(F, TLI), Plugin(F, TLI) {} void get(InstrProfValueKind K, std::vector &Candidates) { if (K == PluginT::Kind) @@ -65,8 +66,9 @@ using PluginChainFinal::PluginChainFinal; }; -ValueProfileCollector::ValueProfileCollector(Function &F) - : PImpl(new ValueProfileCollectorImpl(F)) {} +ValueProfileCollector::ValueProfileCollector(Function &F, + TargetLibraryInfo &TLI) + : PImpl(new ValueProfileCollectorImpl(F, TLI)) {} ValueProfileCollector::~ValueProfileCollector() = default; diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc --- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc +++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -23,12 +23,14 @@ ///--------------------------- MemIntrinsicPlugin ------------------------------ class MemIntrinsicPlugin : public InstVisitor { Function &F; + TargetLibraryInfo &TLI; std::vector *Candidates; public: static constexpr InstrProfValueKind Kind = IPVK_MemOPSize; - MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {} + MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI) + : F(Fn), TLI(TLI), Candidates(nullptr) {} void run(std::vector &Cs) { Candidates = &Cs; @@ -45,6 +47,22 @@ Instruction *AnnotatedInst = &MI; Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); } + void visitCallInst(CallInst &CI) { + auto *F = CI.getCalledFunction(); + if (!F) + return; + LibFunc Func; + if (TLI.getLibFunc(CI, Func) && + (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) { + Value *Length = CI.getArgOperand(2); + // Not instrument constant length calls. + if (dyn_cast(Length)) + return; + Instruction *InsertPt = &CI; + Instruction *AnnotatedInst = &CI; + Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); + } + } }; ///------------------------ IndirectCallPromotionPlugin ------------------------ @@ -54,7 +72,7 @@ public: static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget; - IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {} + IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {} void run(std::vector &Candidates) { std::vector Result = findIndirectCalls(F); diff --git a/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext b/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext --- a/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext +++ b/llvm/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext @@ -14,7 +14,27 @@ # ValueKind = IPVK_MemOPSize: 1 # NumValueSites: -1 +3 +9 +7:33 +2:88 +9:72 +4:66 +1:99 +5:55 +6:44 +3:77 +8:22 +9 +7:33 +2:88 +9:72 +4:66 +1:99 +5:55 +6:44 +3:77 +8:22 9 7:33 2:88 diff --git a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll --- a/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll +++ b/llvm/test/Transforms/PGOProfile/memop_size_annotation.ll @@ -33,6 +33,12 @@ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false) ; MEMOP_ANNOTATION: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false) ; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE:[0-9]+]] + %memcmp = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION: call i32 @memcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE]] + %bcmp = call i32 @bcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION: call i32 @bcmp(i8* %dst, i8* %src, i64 %conv) +; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE]] ; MEMOP_ANNOTATION9: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66, i64 5, i64 55, i64 6, i64 44, i64 7, i64 33, i64 8, i64 22} ; MEMOP_ANNOTATION4: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72} br label %for.inc @@ -56,4 +62,7 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) +declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) + declare void @llvm.lifetime.end(i64, i8* nocapture) diff --git a/llvm/test/Transforms/PGOProfile/memop_size_opt.ll b/llvm/test/Transforms/PGOProfile/memop_size_opt.ll --- a/llvm/test/Transforms/PGOProfile/memop_size_opt.ll +++ b/llvm/test/Transforms/PGOProfile/memop_size_opt.ll @@ -57,12 +57,6 @@ ; MEMOP_OPT: br label %[[MERGE_LABEL2]] ; MEMOP_OPT: [[MERGE_LABEL2]]: ; MEMOP_OPT: br label %for.inc -; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99} -; Should be 457 total left (original total count 556, minus 99 from specialized -; value 1, which is removed from VP array. Also, we only end up with 5 total -; values, since the default max number of promotions is 5 and therefore -; the rest of the values are ignored when extracting the VP metadata. -; MEMOP_OPT: [[NEWVP]] = !{!"VP", i32 1, i64 457, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66} for.inc: %inc = add nsw i32 %j.0, 1 @@ -79,6 +73,83 @@ ret void } +declare void @consume(i32 %v1, i32 %v2) + +define void @foo_memcmp_bcmp(i8* %dst, i8* %src, i8* %dst2, i8* %src2, i32* %a, i32 %n) !prof !27 { +entry: + br label %for.cond + +for.cond: + %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ] + %cmp = icmp slt i32 %i.0, %n + br i1 %cmp, label %for.body, label %for.end6, !prof !28 + +for.body: + br label %for.cond1 + +for.cond1: + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ] + %idx.ext = sext i32 %i.0 to i64 + %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext + %0 = load i32, i32* %add.ptr, align 4 + %cmp2 = icmp slt i32 %j.0, %0 + br i1 %cmp2, label %for.body3, label %for.end, !prof !29 + +for.body3: + %add = add nsw i32 %i.0, 1 + %conv = sext i32 %add to i64 + %memcmp = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv), !prof !30 + %bcmp = call i32 @bcmp(i8* %dst2, i8* %src2, i64 %conv), !prof !31 + call void @consume(i32 %memcmp, i32 %bcmp) + br label %for.inc + +; MEMOP_OPT: switch i64 %conv, label %[[DEFAULT_LABEL:.*]] [ +; MEMOP_OPT: i64 1, label %[[CASE_1_LABEL:.*]] +; MEMOP_OPT: ], !prof [[SWITCH_BW:![0-9]+]] +; MEMOP_OPT: [[CASE_1_LABEL]]: +; MEMOP_OPT: %[[RV:.*]] = call i32 @memcmp(i8* %dst, i8* %src, i64 1) +; MEMOP_OPT: br label %[[MERGE_LABEL:.*]] +; MEMOP_OPT: [[DEFAULT_LABEL]]: +; MEMOP_OPT: %[[RVD:.*]] = call i32 @memcmp(i8* %dst, i8* %src, i64 %conv), !prof [[NEWVP:![0-9]+]] +; MEMOP_OPT: br label %[[MERGE_LABEL]] +; MEMOP_OPT: [[MERGE_LABEL]]: +; MEMOP_OPT: %[[PHI:.*]] = phi i32 [ %[[RVD]], %[[DEFAULT_LABEL]] ], [ %[[RV]], %[[CASE_1_LABEL]] ] +; MEMOP_OPT: switch i64 %conv, label %[[DEFAULT_LABEL2:.*]] [ +; MEMOP_OPT: i64 1, label %[[CASE_1_LABEL2:.*]] +; MEMOP_OPT: ], !prof [[SWITCH_BW:![0-9]+]] +; MEMOP_OPT: [[CASE_1_LABEL2]]: +; MEMOP_OPT: %[[RV2:.*]] = call i32 @bcmp(i8* %dst2, i8* %src2, i64 1) +; MEMOP_OPT: br label %[[MERGE_LABEL2:.*]] +; MEMOP_OPT: [[DEFAULT_LABEL2]]: +; MEMOP_OPT: %[[RVD2:.*]] = call i32 @bcmp(i8* %dst2, i8* %src2, i64 %conv), !prof [[NEWVP]] +; MEMOP_OPT: br label %[[MERGE_LABEL2]] +; MEMOP_OPT: [[MERGE_LABEL2]]: +; MEMOP_OPT: %[[PHI2:.*]] = phi i32 [ %[[RVD2]], %[[DEFAULT_LABEL2]] ], [ %[[RV2]], %[[CASE_1_LABEL2]] ] +; MEMOP_OPT: call void @consume(i32 %[[PHI]], i32 %[[PHI2]]) +; MEMOP_OPT: br label %for.inc + +for.inc: + %inc = add nsw i32 %j.0, 1 + br label %for.cond1 + +for.end: + br label %for.inc4 + +for.inc4: + %inc5 = add nsw i32 %i.0, 1 + br label %for.cond + +for.end6: + ret void +} + +; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99} +; Should be 457 total left (original total count 556, minus 99 from specialized +; value 1, which is removed from VP array. Also, we only end up with 5 total +; values, since the default max number of promotions is 5 and therefore +; the rest of the values are ignored when extracting the VP metadata. +; MEMOP_OPT: [[NEWVP]] = !{!"VP", i32 1, i64 457, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66} + !llvm.module.flags = !{!0} !0 = !{i32 1, !"ProfileSummary", !1} @@ -118,6 +189,9 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) +declare i32 @memcmp(i8*, i8*, i64) +declare i32 @bcmp(i8*, i8*, i64) + declare void @llvm.lifetime.end(i64, i8* nocapture) ; YAML: --- !Passed @@ -127,7 +201,7 @@ ; YAML-NEXT: Hotness: 0 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'optimized ' -; YAML-NEXT: - Intrinsic: memcpy +; YAML-NEXT: - Memop: memcpy ; YAML-NEXT: - String: ' with count ' ; YAML-NEXT: - Count: '99' ; YAML-NEXT: - String: ' out of ' @@ -143,7 +217,39 @@ ; YAML-NEXT: Hotness: 0 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'optimized ' -; YAML-NEXT: - Intrinsic: memcpy +; YAML-NEXT: - Memop: memcpy +; YAML-NEXT: - String: ' with count ' +; YAML-NEXT: - Count: '99' +; YAML-NEXT: - String: ' out of ' +; YAML-NEXT: - Total: '556' +; YAML-NEXT: - String: ' for ' +; YAML-NEXT: - Versions: '1' +; YAML-NEXT: - String: ' versions' +; YAML-NEXT: ... +; YAML-NEXT: --- !Passed +; YAML-NEXT: Pass: pgo-memop-opt +; YAML-NEXT: Name: memopt-opt +; YAML-NEXT: Function: foo_memcmp_bcmp +; YAML-NEXT: Hotness: 0 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'optimized ' +; YAML-NEXT: - Memop: memcmp +; YAML-NEXT: - String: ' with count ' +; YAML-NEXT: - Count: '99' +; YAML-NEXT: - String: ' out of ' +; YAML-NEXT: - Total: '556' +; YAML-NEXT: - String: ' for ' +; YAML-NEXT: - Versions: '1' +; YAML-NEXT: - String: ' versions' +; YAML-NEXT: ... +; YAML-NEXT: --- !Passed +; YAML-NEXT: Pass: pgo-memop-opt +; YAML-NEXT: Name: memopt-opt +; YAML-NEXT: Function: foo_memcmp_bcmp +; YAML-NEXT: Hotness: 0 +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'optimized ' +; YAML-NEXT: - Memop: bcmp ; YAML-NEXT: - String: ' with count ' ; YAML-NEXT: - Count: '99' ; YAML-NEXT: - String: ' out of '