Index: include/llvm/Transforms/IPO/PassManagerBuilder.h =================================================================== --- include/llvm/Transforms/IPO/PassManagerBuilder.h +++ include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -184,7 +184,8 @@ void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM); void addPGOInstrPasses(legacy::PassManagerBase &MPM); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); - void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const; + void addInstructionCombiningPass(legacy::PassManagerBase &MPM, + bool UseProfileInfo = true) const; public: /// populateFunctionPassManager - This fills in the function pass manager, Index: include/llvm/Transforms/InstCombine/InstCombine.h =================================================================== --- include/llvm/Transforms/InstCombine/InstCombine.h +++ include/llvm/Transforms/InstCombine/InstCombine.h @@ -44,12 +44,14 @@ class InstructionCombiningPass : public FunctionPass { InstCombineWorklist Worklist; const bool ExpensiveCombines; + const bool UseProfileInfo; public: static char ID; // Pass identification, replacement for typeid - InstructionCombiningPass(bool ExpensiveCombines = true) - : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines) { + InstructionCombiningPass(bool ExpensiveCombines = true, bool UseProfileInfo = true) + : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), + UseProfileInfo(UseProfileInfo) { initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); } Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -130,7 +130,8 @@ // into: // %Z = add int 2, %X // -FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines = true); +FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines = true, + bool UseProfileInfo = true); 
//===----------------------------------------------------------------------===// // Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -224,9 +224,9 @@ } void PassManagerBuilder::addInstructionCombiningPass( - legacy::PassManagerBase &PM) const { + legacy::PassManagerBase &PM, bool UseProfileInfo) const { bool ExpensiveCombines = OptLevel > 2; - PM.add(createInstructionCombiningPass(ExpensiveCombines)); + PM.add(createInstructionCombiningPass(ExpensiveCombines, UseProfileInfo)); } void PassManagerBuilder::populateFunctionPassManager( @@ -270,7 +270,9 @@ MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass()); // Catch trivial redundancies MPM.add(createCFGSimplificationPass()); // Merge & remove BBs - MPM.add(createInstructionCombiningPass()); // Combine silly seq's + MPM.add(createInstructionCombiningPass( + /*ExpensiveCombines = */ true, + /*UseProfileInfo = */ false)); // Combine silly seq's addExtensionsToPM(EP_Peephole, MPM); } if (EnablePGOInstrGen) { @@ -446,7 +448,7 @@ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination - addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE + addInstructionCombiningPass(MPM, /*UseProfileInfo = */ false); // Clean up after IPCP & DAE addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE } Index: lib/Transforms/InstCombine/InstCombineInternal.h =================================================================== --- lib/Transforms/InstCombine/InstCombineInternal.h +++ lib/Transforms/InstCombine/InstCombineInternal.h @@ -32,6 +32,8 @@ #define DEBUG_TYPE "instcombine" namespace llvm { +class BlockFrequencyInfo; +class ProfileSummaryInfo; class CallSite; class DataLayout; class DominatorTree; @@ -182,6 +184,7 @@ const bool MinimizeSize; /// Enable combines that trigger rarely but are costly 
in compiletime. const bool ExpensiveCombines; + const bool UseProfileInfo; AliasAnalysis *AA; @@ -190,6 +193,8 @@ TargetLibraryInfo &TLI; DominatorTree &DT; const DataLayout &DL; + BlockFrequencyInfo &BFI; + ProfileSummaryInfo *PSI; // Optional analyses. When non-null, these can both be used to do better // combining and will be updated to reflect any changes. @@ -199,12 +204,16 @@ public: InstCombiner(InstCombineWorklist &Worklist, BuilderTy *Builder, - bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA, + bool MinimizeSize, bool ExpensiveCombines, + bool UseProfileInfo, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, - DominatorTree &DT, const DataLayout &DL, LoopInfo *LI) + DominatorTree &DT, const DataLayout &DL, + BlockFrequencyInfo &BFI, ProfileSummaryInfo *PSI, + LoopInfo *LI) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), - ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT), - DL(DL), LI(LI), MadeIRChange(false) {} + ExpensiveCombines(ExpensiveCombines), UseProfileInfo(UseProfileInfo), + AA(AA), AC(AC), TLI(TLI), DT(DT), + DL(DL), BFI(BFI), PSI(PSI), LI(LI), MadeIRChange(false) {} /// \brief Run the combiner over the entire worklist until it is empty. 
///
@@ -214,6 +223,8 @@
   AssumptionCache &getAssumptionCache() const { return AC; }
 
   const DataLayout &getDataLayout() const { return DL; }
+
+  const BlockFrequencyInfo &getBlockFrequencyInfo() const { return BFI; }
 
   DominatorTree &getDominatorTree() const { return DT; }
 
Index: lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -15,7 +15,10 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
@@ -47,6 +50,82 @@
   return false;
 }
 
+/// isAllocaPointerAccessHot - Decide whether the constant-length alloca \p AI
+/// should be kept on the stack instead of being replaced by the constant
+/// global it is copied from. This applies to x86_64 PIE builds only, where
+/// accessing a const stack array is cheaper, performance wise.
+///
+/// Returns true when profile info is not usable yet (\p UseProfileInfo is
+/// false), or when some load of the alloca pointer (directly or through a
+/// bitcast/GEP) lives in a hot block whose profile count exceeds that of the
+/// alloca's block and is at least the alloca's size in bytes. Returns false
+/// otherwise, including when \p PSI is null.
+static bool isAllocaPointerAccessHot(AllocaInst &AI, BlockFrequencyInfo &BFI,
+                                     ProfileSummaryInfo *PSI,
+                                     const DataLayout &DL,
+                                     bool UseProfileInfo = true) {
+  // We only care about constant length allocations.
+  auto *CS = dyn_cast<ConstantInt>(AI.getArraySize());
+  if (!CS)
+    return false;
+
+  // We only care about this when building as PIE for x86_64.
+  // TODO: Do we also need to check for code model?
+  Module *M = AI.getModule();
+  Triple T = Triple(M->getTargetTriple());
+  if (M->getPIELevel() == PIELevel::Default ||
+      T.getArch() != Triple::ArchType::x86_64)
+    return false;
+
+  // If profile information is not yet available at this point (an early
+  // instance of InstCombine), return true conservatively so that this
+  // analysis can be done later when profile info is available.
+  if (!UseProfileInfo)
+    return true;
+
+  // PSI is a pointer and may be null; without it no block can be
+  // classified as hot, so there is nothing to protect.
+  if (!PSI)
+    return false;
+
+  uint64_t TypeSize = DL.getTypeAllocSize(AI.getAllocatedType());
+  APInt TotalSize = CS->getValue().zextOrSelf(128) * APInt(128, TypeSize);
+
+  // The count of the alloca's own block does not depend on the use being
+  // inspected, so look it up once.
+  auto AllocaCount = BFI.getBlockProfileCount(AI.getParent());
+  uint64_t AllocaCountVal = AllocaCount ? *AllocaCount : 0;
+
+  // NOTE(review): the instruction kinds tested below (LoadInst, BitCastInst,
+  // GetElementPtrInst) and the inline size of the worklist were reconstructed
+  // from context -- confirm against the intended change.
+  SmallVector<Value *, 8> ValuesToInspect;
+  ValuesToInspect.emplace_back(&AI);
+
+  while (!ValuesToInspect.empty()) {
+    Value *V = ValuesToInspect.pop_back_val();
+    for (auto &U : V->uses()) {
+      auto *I = cast<Instruction>(U.getUser());
+      BasicBlock *B = I->getParent();
+      if (isa<LoadInst>(I) && PSI->isHotBB(B, &BFI)) {
+        auto Count = BFI.getBlockProfileCount(B);
+        uint64_t CountVal = Count ? *Count : 0;
+        if (CountVal > AllocaCountVal && !TotalSize.ugt(CountVal)) {
+          DEBUG(dbgs() << "Alloca with hot pointer access : " << AI << '\n');
+          DEBUG(dbgs() << "Alloca size is " << TotalSize
+                       << " bytes but is accessed " << CountVal << " times"
+                       << '\n');
+          return true;
+        }
+      }
+      // Pointers derived from this one must have their uses checked as well.
+      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I))
+        ValuesToInspect.emplace_back(I);
+    }
+  }
+  return false;
+}
+
 /// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
 /// pointer to an alloca. Ignore any reads of the pointer, return false if we
 /// see any stores or other unknown uses.
If we see pointer arithmetic, keep @@ -379,7 +442,8 @@ } } - if (AI.getAlignment()) { + if (AI.getAlignment() && + !isAllocaPointerAccessHot(AI, BFI, PSI, DL, UseProfileInfo)) { // Check to see if this allocation is only modified by a memcpy/memmove from // a constant global whose alignment is equal to or exceeds that of the // allocation. If this is the case, we can change all users to use Index: lib/Transforms/InstCombine/InstructionCombining.cpp =================================================================== --- lib/Transforms/InstCombine/InstructionCombining.cpp +++ lib/Transforms/InstCombine/InstructionCombining.cpp @@ -42,6 +42,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" @@ -49,6 +50,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CFG.h" @@ -3120,7 +3122,9 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, + BlockFrequencyInfo &BFI, ProfileSummaryInfo *PSI, bool ExpensiveCombines = true, + bool UseProfileInfo = true, LoopInfo *LI = nullptr) { auto &DL = F.getParent()->getDataLayout(); ExpensiveCombines |= EnableExpensiveCombines; @@ -3151,7 +3155,7 @@ bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist); InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines, - AA, AC, TLI, DT, DL, LI); + UseProfileInfo, AA, AC, TLI, DT, DL, BFI, PSI, LI); IC.MaxArraySizeForCombine = MaxArraySize; Changed |= IC.run(); @@ -3169,10 +3173,14 @@ auto &TLI 
= AM.getResult(F); auto *LI = AM.getCachedResult(F); + auto &BFI = AM.getResult(F); + + auto &MAM = AM.getResult(F).getManager(); + auto *PSI = MAM.getCachedResult(*F.getParent()); // FIXME: The AliasAnalysis is not yet supported in the new pass manager - if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT, - ExpensiveCombines, LI)) + if (!combineInstructionsOverFunction(F, Worklist, nullptr, AC, TLI, DT, BFI, + PSI, ExpensiveCombines, false, LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3188,6 +3196,8 @@ AU.setPreservesCFG(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -3205,13 +3215,15 @@ auto &AC = getAnalysis().getAssumptionCache(F); auto &TLI = getAnalysis().getTLI(); auto &DT = getAnalysis().getDomTree(); + auto &BFI = getAnalysis().getBFI(); + auto *PSI = getAnalysis().getPSI(); // Optional analyses. auto *LIWP = getAnalysisIfAvailable(); auto *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, - ExpensiveCombines, LI); + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, BFI, PSI, + ExpensiveCombines, UseProfileInfo, LI); } char InstructionCombiningPass::ID = 0; @@ -3220,6 +3232,8 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine", @@ -3234,6 +3248,7 @@ initializeInstructionCombiningPassPass(*unwrap(R)); } -FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) { - return new InstructionCombiningPass(ExpensiveCombines); +FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines, + bool UseProfileInfo) { + return new InstructionCombiningPass(ExpensiveCombines, UseProfileInfo); } Index: test/Transforms/PGOProfile/Inputs/x86_64-pie-alloca.proftext =================================================================== --- test/Transforms/PGOProfile/Inputs/x86_64-pie-alloca.proftext +++ test/Transforms/PGOProfile/Inputs/x86_64-pie-alloca.proftext @@ -0,0 +1,19 @@ +# IR level Instrumentation Flag +:ir +_Z3usei +# Func Hash: +12884901887 +# Num Counters: +1 +# Counter Values: +100 + +_Z3foov +# Func Hash: +34137660316 +# Num Counters: +2 +# Counter Values: +100 +1 + Index: test/Transforms/PGOProfile/x86_64-pie-alloca.ll =================================================================== --- test/Transforms/PGOProfile/x86_64-pie-alloca.ll +++ test/Transforms/PGOProfile/x86_64-pie-alloca.ll @@ -0,0 +1,64 @@ +; Test that alloca of const arrays are retained instead of converting them +; to a global when their use is hot +; RUN: 
llvm-profdata merge %S/Inputs/x86_64-pie-alloca.proftext -o %t.profdata +; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-linux-gnu" + +@_ZZ3foovE3arr = private unnamed_addr constant [2 x i32] [i32 1, i32 2], align 4 + +; Function Attrs: noinline uwtable +define i32 @_Z3foov() { + +;CHECK: entry: +;CHECK-NEXT: %arr = alloca i64, align 8 +;CHECK: store i64 8589934593, i64* %arr, align 8 +;CHECK: for.cond: + +entry: + %arr = alloca [2 x i32], align 4 + %i = alloca i32, align 4 + %0 = bitcast [2 x i32]* %arr to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast ([2 x i32]* @_ZZ3foovE3arr to i8*), i64 8, i32 4, i1 false) + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 100 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %idxprom = sext i32 %rem to i64 + %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %arr, i64 0, i64 %idxprom + %3 = load i32, i32* %arrayidx, align 4 + %call = call i32 @_Z3usei(i32 %3) + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret i32 0 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) + +declare i32 @_Z3usei(i32) + +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{i32 1, !"PIE Level", i32 2} + +; CHECK-DAG: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}} +; CHECK-DAG: {{![0-9]+}} = !{!"NumFunctions", i64 2} +; CHECK-DAG: {{![0-9]+}} = !{!"MaxFunctionCount", i64 100} +; CHECK-DAG: {{![0-9]+}} = !{!"branch_weights", i32 
100, i32 1}