Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -308,10 +308,10 @@ /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If /// the analysis failed (no benefits expected from the unrolling, or the loop is /// too big to analyze), the returned value is None. -static Optional<EstimatedUnrollCost> -analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, - ScalarEvolution &SE, const TargetTransformInfo &TTI, - unsigned MaxUnrolledLoopSize) { +static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost( + const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE, + const SmallPtrSetImpl<const Value *> &EphValues, + const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) { // We want to be able to scale offsets by the trip count and add more offsets // to them without checking for overflows, and we already don't want to // analyze *massive* trip counts, so we force the max to be reasonably small. @@ -490,7 +490,9 @@ // it. We don't change the actual IR, just count optimization // opportunities. for (Instruction &I : *BB) { - if (isa<DbgInfoIntrinsic>(I)) + // These won't get into the final code - don't even try calculating the + // cost for them. + if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I)) continue; // Track this instruction's expected baseline cost when executing the @@ -607,13 +609,11 @@ } /// ApproximateLoopSize - Approximate the size of the loop. 
-static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls, - bool &NotDuplicatable, bool &Convergent, - const TargetTransformInfo &TTI, - AssumptionCache *AC, unsigned BEInsns) { - SmallPtrSet<const Value *, 32> EphValues; - CodeMetrics::collectEphemeralValues(L, AC, EphValues); - +static unsigned +ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, + bool &Convergent, const TargetTransformInfo &TTI, + const SmallPtrSetImpl<const Value *> &EphValues, + unsigned BEInsns) { CodeMetrics Metrics; for (BasicBlock *BB : L->blocks()) Metrics.analyzeBasicBlock(BB, TTI, EphValues); @@ -708,8 +708,9 @@ // Calculates unroll count and writes it to UP.Count. static bool computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, - unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, + ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, + OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, + unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. @@ -779,7 +780,7 @@ // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, SE, TTI, + L, FullUnrollTripCount, DT, SE, EphValues, TTI, UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); @@ -975,8 +976,13 @@ // Exit early if unrolling is disabled. 
if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) return LoopUnrollResult::Unmodified; - unsigned LoopSize = ApproximateLoopSize( - L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns); + + SmallPtrSet<const Value *, 32> EphValues; + CodeMetrics::collectEphemeralValues(L, &AC, EphValues); + + unsigned LoopSize = + ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent, + TTI, EphValues, UP.BEInsns); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); if (NotDuplicatable) { DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable" @@ -1040,9 +1046,9 @@ // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. bool UseUpperBound = false; - bool IsCountSetExplicitly = - computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, - TripMultiple, LoopSize, UP, UseUpperBound); + bool IsCountSetExplicitly = computeUnrollCount( + L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; // Unroll factor (Count) must be less or equal to TripCount. Index: test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -loop-unroll -unroll-threshold=42 | FileCheck %s --check-prefix=ANALYZE-FULL + +; This test is supposed to check that calls to @llvm.assume builtin are not +; prohibiting the analysis of full unroll profitability in case the cost of the +; unrolled loop (not accounting to any simplifications done by such unrolling) is +; higher than some threshold. 
+; +; Ensure that we indeed are testing this code path by verifying that the loop is +; not unrolled without such analysis: + +; RUN: opt -S < %s -loop-unroll -unroll-threshold=42 -unroll-max-iteration-count-to-analyze=2 \ +; RUN: | FileCheck %s --check-prefix=DONT-ANALYZE-FULL + +; Function Attrs: nounwind +declare void @llvm.assume(i1) #1 + +define i32 @foo(i32* %a) { +; ANALYZE-FULL-LABEL: @foo( +; ANALYZE-FULL-NEXT: entry: +; ANALYZE-FULL-NEXT: br label [[FOR_BODY:%.*]] +; ANALYZE-FULL: for.body: +; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE:%.*]], label [[FOR_NEXT:%.*]] +; ANALYZE-FULL: do_store: +; ANALYZE-FULL-NEXT: store i32 0, i32* [[A:%.*]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT]] +; ANALYZE-FULL: for.next: +; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_1:%.*]], label [[FOR_NEXT_1:%.*]] +; ANALYZE-FULL: do_store.1: +; ANALYZE-FULL-NEXT: [[GEP_1:%.*]] = getelementptr i32, i32* [[A]], i32 1 +; ANALYZE-FULL-NEXT: store i32 1, i32* [[GEP_1]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_1]] +; ANALYZE-FULL: for.next.1: +; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_2:%.*]], label [[FOR_NEXT_2:%.*]] +; ANALYZE-FULL: do_store.2: +; ANALYZE-FULL-NEXT: [[GEP_2:%.*]] = getelementptr i32, i32* [[A]], i32 2 +; ANALYZE-FULL-NEXT: store i32 2, i32* [[GEP_2]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_2]] +; ANALYZE-FULL: for.next.2: +; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_3:%.*]], label [[FOR_NEXT_3:%.*]] +; ANALYZE-FULL: do_store.3: +; ANALYZE-FULL-NEXT: [[GEP_3:%.*]] = getelementptr i32, i32* [[A]], i32 3 +; ANALYZE-FULL-NEXT: store i32 3, i32* [[GEP_3]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_3]] +; ANALYZE-FULL: for.next.3: +; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_4:%.*]], label [[FOR_NEXT_4:%.*]] +; ANALYZE-FULL: do_store.4: +; ANALYZE-FULL-NEXT: [[GEP_4:%.*]] = getelementptr i32, i32* [[A]], i32 4 +; ANALYZE-FULL-NEXT: store i32 4, i32* [[GEP_4]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_4]] +; ANALYZE-FULL: for.next.4: +; 
ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_5:%.*]], label [[FOR_NEXT_5:%.*]] +; ANALYZE-FULL: do_store.5: +; ANALYZE-FULL-NEXT: [[GEP_5:%.*]] = getelementptr i32, i32* [[A]], i32 5 +; ANALYZE-FULL-NEXT: store i32 5, i32* [[GEP_5]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_5]] +; ANALYZE-FULL: for.next.5: +; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_6:%.*]], label [[FOR_NEXT_6:%.*]] +; ANALYZE-FULL: do_store.6: +; ANALYZE-FULL-NEXT: [[GEP_6:%.*]] = getelementptr i32, i32* [[A]], i32 6 +; ANALYZE-FULL-NEXT: store i32 6, i32* [[GEP_6]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_6]] +; ANALYZE-FULL: for.next.6: +; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_7:%.*]], label [[FOR_NEXT_7:%.*]] +; ANALYZE-FULL: do_store.7: +; ANALYZE-FULL-NEXT: [[GEP_7:%.*]] = getelementptr i32, i32* [[A]], i32 7 +; ANALYZE-FULL-NEXT: store i32 7, i32* [[GEP_7]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_7]] +; ANALYZE-FULL: for.next.7: +; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_8:%.*]], label [[FOR_NEXT_8:%.*]] +; ANALYZE-FULL: do_store.8: +; ANALYZE-FULL-NEXT: [[GEP_8:%.*]] = getelementptr i32, i32* [[A]], i32 8 +; ANALYZE-FULL-NEXT: store i32 8, i32* [[GEP_8]] +; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_8]] +; ANALYZE-FULL: for.next.8: +; ANALYZE-FULL-NEXT: ret i32 9 +; +; DONT-ANALYZE-FULL-LABEL: @foo( +; DONT-ANALYZE-FULL-NEXT: entry: +; DONT-ANALYZE-FULL-NEXT: br label [[FOR_BODY:%.*]] +; DONT-ANALYZE-FULL: for.body: +; DONT-ANALYZE-FULL-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_NEXT:%.*]] ] +; DONT-ANALYZE-FULL-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 +; DONT-ANALYZE-FULL-NEXT: [[CMP:%.*]] = icmp ule i32 [[INDVAR]], 20 +; DONT-ANALYZE-FULL-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; DONT-ANALYZE-FULL-NEXT: [[CMP2:%.*]] = icmp ule i32 [[INDVAR]], 3 +; DONT-ANALYZE-FULL-NEXT: br i1 [[CMP2]], label [[DO_STORE:%.*]], label [[FOR_NEXT]] +; DONT-ANALYZE-FULL: do_store: +; DONT-ANALYZE-FULL-NEXT: [[GEP:%.*]] = 
getelementptr i32, i32* [[A:%.*]], i32 [[INDVAR]] +; DONT-ANALYZE-FULL-NEXT: store i32 [[INDVAR]], i32* [[GEP]] +; DONT-ANALYZE-FULL-NEXT: br label [[FOR_NEXT]] +; DONT-ANALYZE-FULL: for.next: +; DONT-ANALYZE-FULL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INDVAR_NEXT]], 9 +; DONT-ANALYZE-FULL-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEXIT:%.*]] +; DONT-ANALYZE-FULL: loopexit: +; DONT-ANALYZE-FULL-NEXT: [[INDVAR_NEXT_LCSSA:%.*]] = phi i32 [ [[INDVAR_NEXT]], [[FOR_NEXT]] ] +; DONT-ANALYZE-FULL-NEXT: ret i32 [[INDVAR_NEXT_LCSSA]] +; +entry: + br label %for.body +for.body: + %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %for.next ] + %indvar.next = add i32 %indvar, 1 + %cmp = icmp ule i32 %indvar, 20 + tail call void @llvm.assume(i1 %cmp) + %cmp2 = icmp ule i32 %indvar, 3 + br i1 %cmp2, label %do_store, label %for.next + +do_store: + %gep = getelementptr i32, i32* %a, i32 %indvar + store i32 %indvar, i32* %gep + br label %for.next + +for.next: + %exitcond = icmp ne i32 %indvar.next, 9 + br i1 %exitcond, label %for.body, label %loopexit +loopexit: + ret i32 %indvar.next +}