Index: include/llvm/Analysis/CodeMetrics.h =================================================================== --- include/llvm/Analysis/CodeMetrics.h +++ include/llvm/Analysis/CodeMetrics.h @@ -16,10 +16,13 @@ #define LLVM_ANALYSIS_CODEMETRICS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/CallSite.h" namespace llvm { +class AssumptionTracker; class BasicBlock; +class Loop; class Function; class Instruction; class DataLayout; @@ -85,7 +88,18 @@ NumInlineCandidates(0), NumVectorInsts(0), NumRets(0) {} /// \brief Add information about a block to the current state. - void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI); + void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI, + SmallPtrSetImpl &EphValues); + + /// \brief Collect a loop's ephemeral values (those used only by an assume + /// or similar intrinsics in the loop). + static void collectEphemeralValues(const Loop *L, AssumptionTracker *AT, + SmallPtrSetImpl &EphValues); + + /// \brief Collect a functions's ephemeral values (those used only by an + /// assume or similar intrinsics in the function). + static void collectEphemeralValues(const Function *L, AssumptionTracker *AT, + SmallPtrSetImpl &EphValues); }; } Index: include/llvm/Analysis/InlineCost.h =================================================================== --- include/llvm/Analysis/InlineCost.h +++ include/llvm/Analysis/InlineCost.h @@ -19,6 +19,7 @@ #include namespace llvm { +class AssumptionTracker; class CallSite; class DataLayout; class Function; @@ -100,6 +101,7 @@ /// \brief Cost analyzer used by inliner. class InlineCostAnalysis : public CallGraphSCCPass { const TargetTransformInfo *TTI; + AssumptionTracker *AT; public: static char ID; Index: lib/Analysis/CodeMetrics.cpp =================================================================== --- lib/Analysis/CodeMetrics.cpp +++ lib/Analysis/CodeMetrics.cpp @@ -11,23 +11,99 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/AssumptionTracker.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "code-metrics" using namespace llvm; +static void completeEphemeralValues(SmallVector &WorkSet, + SmallPtrSetImpl &EphValues) { + SmallPtrSet Visited; + + // Make sure that all of the items in WorkSet are in our EphValues set. + EphValues.insert(WorkSet.begin(), WorkSet.end()); + + // Note: We don't speculate PHIs here, so we'll miss instruction chains kept + // alive only by ephemeral values. + + while (!WorkSet.empty()) { + const Value *V = WorkSet.pop_back_val(); + if (!Visited.insert(V)) + continue; + + // If all uses of this value are ephemeral, then so is this value. + bool FoundNEUse = false; + for (const User *I : V->users()) + if (!EphValues.count(I)) { + FoundNEUse = true; + break; + } + + if (FoundNEUse) + continue; + + EphValues.insert(V); + DEBUG(dbgs() << "Ephemeral Value: " << *V << "\n"); + + if (const User *U = dyn_cast(V)) + for (const Value *J : U->operands()) { + if (isSafeToSpeculativelyExecute(J)) + WorkSet.push_back(J); + } + } +} + +// Find all ephemeral values. +void CodeMetrics::collectEphemeralValues(const Loop *L, AssumptionTracker *AT, + SmallPtrSetImpl &EphValues) { + SmallVector WorkSet; + + for (auto &I : AT->assumptions(L->getHeader()->getParent())) { + // Filter out call sites outside of the loop so we don't to a function's + // worth of work for each of its loops (and, in the common case, ephemeral + // values in the loop are likely due to @llvm.assume calls in the loop). + if (!L->contains(I->getParent())) + continue; + + WorkSet.push_back(I); + } + + completeEphemeralValues(WorkSet, EphValues); +} + +void CodeMetrics::collectEphemeralValues(const Function *F, AssumptionTracker *AT, + SmallPtrSetImpl &EphValues) { + SmallVector WorkSet; + + for (auto &I : AT->assumptions(const_cast(F))) + WorkSet.push_back(I); + + completeEphemeralValues(WorkSet, EphValues); +} + /// analyzeBasicBlock - Fill in the current structure with information gleaned /// from the specified block. void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + SmallPtrSetImpl &EphValues) { ++NumBlocks; unsigned NumInstsBeforeThisBB = NumInsts; for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); II != E; ++II) { + // Skip ephemeral values. + if (EphValues.count(II)) + continue; + // Special handling for calls. if (isa(II) || isa(II)) { ImmutableCallSite CS(cast(II)); Index: lib/Analysis/IPA/InlineCost.cpp =================================================================== --- lib/Analysis/IPA/InlineCost.cpp +++ lib/Analysis/IPA/InlineCost.cpp @@ -17,7 +17,9 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionTracker.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" @@ -49,6 +51,9 @@ /// The TargetTransformInfo available for this compilation. const TargetTransformInfo &TTI; + /// The cache of @llvm.assume intrinsics. + AssumptionTracker *AT; + // The called function. Function &F; @@ -104,7 +109,7 @@ ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); // Custom analysis routines. - bool analyzeBlock(BasicBlock *BB); + bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl &EphValues); // Disable several entry points to the visitor so we don't accidentally use // them by declaring but not defining them here. @@ -141,8 +146,8 @@ public: CallAnalyzer(const DataLayout *DL, const TargetTransformInfo &TTI, - Function &Callee, int Threshold) - : DL(DL), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0), + AssumptionTracker *AT, Function &Callee, int Threshold) + : DL(DL), TTI(TTI), AT(AT), F(Callee), Threshold(Threshold), Cost(0), IsCallerRecursive(false), IsRecursiveCall(false), ExposesReturnsTwice(false), HasDynamicAlloca(false), ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), @@ -778,7 +783,7 @@ // during devirtualization and so we want to give it a hefty bonus for // inlining, but cap that bonus in the event that inlining wouldn't pan // out. Pretend to inline the function, with a custom threshold. - CallAnalyzer CA(DL, TTI, *F, InlineConstants::IndirectCallThreshold); + CallAnalyzer CA(DL, TTI, AT, *F, InlineConstants::IndirectCallThreshold); if (CA.analyzeCall(CS)) { // We were able to inline the indirect call! Subtract the cost from the // bonus we want to apply, but don't go below zero. @@ -881,7 +886,8 @@ /// aborts early if the threshold has been exceeded or an impossible to inline /// construct has been detected. It returns false if inlining is no longer /// viable, and true if inlining remains viable. -bool CallAnalyzer::analyzeBlock(BasicBlock *BB) { +bool CallAnalyzer::analyzeBlock(BasicBlock *BB, + SmallPtrSetImpl &EphValues) { for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { // FIXME: Currently, the number of instructions in a function regardless of // our ability to simplify them during inline to constants or dead code, @@ -893,6 +899,10 @@ if (isa(I)) continue; + // Skip ephemeral values. + if (EphValues.count(I)) + continue; + ++NumInstructions; if (isa(I) || I->getType()->isVectorTy()) ++NumVectorInstructions; @@ -1096,6 +1106,12 @@ NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); NumAllocaArgs = SROAArgValues.size(); + // FIXME: If a caller has multiple calls to a callee, we end up recomputing + // the ephemeral values multiple times (and they're completely determined by + // the callee, so this is purely duplicate work). + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(&F, AT, EphValues); + // The worklist of live basic blocks in the callee *after* inlining. We avoid // adding basic blocks of the callee which can be proven to be dead for this // particular call site in order to get more accurate cost estimates. This @@ -1129,7 +1145,7 @@ // Analyze the cost of this block. If we blow through the threshold, this // returns false, and we can bail on out. - if (!analyzeBlock(BB)) { + if (!analyzeBlock(BB, EphValues)) { if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca || HasIndirectBr) return false; @@ -1217,6 +1233,7 @@ INITIALIZE_PASS_BEGIN(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", true, true) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) INITIALIZE_PASS_END(InlineCostAnalysis, "inline-cost", "Inline Cost Analysis", true, true) @@ -1228,12 +1245,14 @@ void InlineCostAnalysis::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); + AU.addRequired(); AU.addRequired(); CallGraphSCCPass::getAnalysisUsage(AU); } bool InlineCostAnalysis::runOnSCC(CallGraphSCC &SCC) { TTI = &getAnalysis(); + AT = &getAnalysis(); return false; } @@ -1290,7 +1309,7 @@ DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n"); - CallAnalyzer CA(Callee->getDataLayout(), *TTI, *Callee, Threshold); + CallAnalyzer CA(Callee->getDataLayout(), *TTI, AT, *Callee, Threshold); bool ShouldInline = CA.analyzeCall(CS); DEBUG(CA.dump()); Index: lib/Transforms/Scalar/LoopRotation.cpp =================================================================== --- lib/Transforms/Scalar/LoopRotation.cpp +++ lib/Transforms/Scalar/LoopRotation.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionTracker.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopPass.h" @@ -53,6 +54,7 @@ // LCSSA form makes instruction renaming easier. void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.addPreserved(); @@ -72,12 +74,14 @@ unsigned MaxHeaderSize; LoopInfo *LI; const TargetTransformInfo *TTI; + AssumptionTracker *AT; }; } char LoopRotate::ID = 0; INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -98,6 +102,7 @@ LI = &getAnalysis(); TTI = &getAnalysis(); + AT = &getAnalysis(); // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the @@ -323,8 +328,11 @@ // Check size of original header and reject loop if it is very big or we can't // duplicate blocks inside it. { + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AT, EphValues); + CodeMetrics Metrics; - Metrics.analyzeBasicBlock(OrigHeader, *TTI); + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); if (Metrics.notDuplicatable) { DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" << " instructions: "; L->dump()); Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -200,11 +200,15 @@ /// ApproximateLoopSize - Approximate the size of the loop. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, - const TargetTransformInfo &TTI) { + const TargetTransformInfo &TTI, + AssumptionTracker *AT) { + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AT, EphValues); + CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I, TTI); + Metrics.analyzeBasicBlock(*I, TTI, EphValues); NumCalls = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; @@ -387,7 +391,7 @@ unsigned NumInlineCandidates; bool notDuplicatable; unsigned LoopSize = - ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI); + ApproximateLoopSize(L, NumInlineCandidates, notDuplicatable, TTI, AT); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); uint64_t UnrolledSize = (uint64_t)LoopSize * Count; if (notDuplicatable) { Index: lib/Transforms/Scalar/LoopUnswitch.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnswitch.cpp +++ lib/Transforms/Scalar/LoopUnswitch.cpp @@ -30,6 +30,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionTracker.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" @@ -103,7 +104,8 @@ // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. - bool countLoop(const Loop *L, const TargetTransformInfo &TTI); + bool countLoop(const Loop *L, const TargetTransformInfo &TTI, + AssumptionTracker *AT); // Clean all data related to given loop. void forgetLoop(const Loop *L); @@ -126,6 +128,7 @@ class LoopUnswitch : public LoopPass { LoopInfo *LI; // Loop information LPPassManager *LPM; + AssumptionTracker *AT; // LoopProcessWorklist - Used to check if second loop needs processing // after RewriteLoopBodyWithConditionConstant rewrites first loop. @@ -164,6 +167,7 @@ /// loop preheaders be inserted into the CFG. /// void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequiredID(LoopSimplifyID); AU.addPreservedID(LoopSimplifyID); AU.addRequired(); @@ -212,7 +216,8 @@ // Analyze loop. Check its size, calculate is it possible to unswitch // it. Returns true if we can unswitch this loop. -bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) { +bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI, + AssumptionTracker *AT) { LoopPropsMapIt PropsIt; bool Inserted; @@ -229,13 +234,16 @@ // large numbers of branches which cause loop unswitching to go crazy. // This is a very ad-hoc heuristic. + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AT, EphValues); + // FIXME: This is overly conservative because it does not take into // consideration code simplification opportunities and code that can // be shared by the resultant unswitched loops. CodeMetrics Metrics; for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) - Metrics.analyzeBasicBlock(*I, TTI); + Metrics.analyzeBasicBlock(*I, TTI, EphValues); Props.SizeEstimation = std::min(Metrics.NumInsts, Metrics.NumBlocks * 5); Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation); @@ -326,6 +334,7 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(AssumptionTracker) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LoopInfo) INITIALIZE_PASS_DEPENDENCY(LCSSA) @@ -376,6 +385,7 @@ if (skipOptnoneFunction(L)) return false; + AT = &getAnalysis(); LI = &getAnalysis(); LPM = &LPM_Ref; DominatorTreeWrapperPass *DTWP = @@ -421,7 +431,8 @@ // Probably we reach the quota of branches for this loop. If so // stop unswitching. - if (!BranchesInfo.countLoop(currentLoop, getAnalysis())) + if (!BranchesInfo.countLoop(currentLoop, getAnalysis(), + AT)) return false; // Loop over all of the basic blocks in the loop. If we find an interior Index: test/Transforms/Inline/ephemeral.ll =================================================================== --- /dev/null +++ test/Transforms/Inline/ephemeral.ll @@ -0,0 +1,32 @@ +; RUN: opt -S -Oz %s | FileCheck %s + +@a = global i32 4 + +define i1 @inner() { + %a1 = load volatile i32* @a + %x1 = add i32 %a1, %a1 + %c = icmp eq i32 %x1, 0 + + ; Here are enough instructions to prevent inlining, but because they are used + ; only by the @llvm.assume intrinsic, they're free (and, thus, inlining will + ; still happen). + %a2 = mul i32 %a1, %a1 + %a3 = sub i32 %a1, 5 + %a4 = udiv i32 %a3, -13 + %a5 = mul i32 %a4, %a4 + %a6 = add i32 %a5, %x1 + %ca = icmp sgt i32 %a6, -7 + tail call void @llvm.assume(i1 %ca) + + ret i1 %c +} + +; @inner() should be inlined for -Oz. +; CHECK-NOT: call i1 @inner +define i1 @outer() optsize { + %r = call i1 @inner() + ret i1 %r +} + +declare void @llvm.assume(i1) nounwind + Index: test/Transforms/LoopUnroll/ephemeral.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/ephemeral.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -S -loop-unroll -unroll-threshold=50 | FileCheck %s + +; Make sure this loop is completely unrolled... +; CHECK-LABEL: @test1 +; CHECK: for.body: +; CHECK-NOT: for.end: + +define i32 @test1(i32* nocapture %a) nounwind uwtable readonly { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.01 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + + ; This loop will be completely unrolled, even with these extra instructions, + ; but only because they're ephemeral (and, thus, free). + %1 = add nsw i32 %0, 2 + %2 = add nsw i32 %1, 4 + %3 = add nsw i32 %2, 4 + %4 = add nsw i32 %3, 4 + %5 = add nsw i32 %4, 4 + %6 = add nsw i32 %5, 4 + %7 = add nsw i32 %6, 4 + %8 = add nsw i32 %7, 4 + %9 = add nsw i32 %8, 4 + %10 = add nsw i32 %9, 4 + %ca = icmp sgt i32 %10, -7 + call void @llvm.assume(i1 %ca) + + %add = add nsw i32 %0, %sum.01 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 5 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add +} + +declare void @llvm.assume(i1) nounwind +