Index: llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
===================================================================
--- llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -35,6 +35,11 @@
     return true;
   }
 
+  // If we have a loop invariant computation, we only need to compute it once.
+  // Given that, all but the first occurrence are free.
+  if (!IterationNumber->isZero() && SE.isLoopInvariant(S, L))
+    return true;
+
   auto *AR = dyn_cast<SCEVAddRecExpr>(S);
   if (!AR || AR->getLoop() != L)
     return false;
Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -725,13 +725,73 @@
   return MaxPercentThresholdBoost;
 }
 
-// Returns loop size estimation for unrolled loop.
-static uint64_t getUnrolledLoopSize(
-    unsigned LoopSize,
-    TargetTransformInfo::UnrollingPreferences &UP) {
-  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
-  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
+/// Helper class to estimate the size of a loop with a given unroll factor.
+/// Note that there's a more expensive estimator used for full unrolling if the
+/// result of this one doesn't show profit, but this is the only estimate used
+/// for partial and runtime unrolling.
+///
+/// TODO: There are a number of ways to make this more sophisticated.
+/// 1) Extend the uniform logic to understand expressions which are uniform for
+///    a given number of iterations (but not all iterations). An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i/4])
+///          b[i];
+///      }
+/// 2) Recognize patterns which repeat predictably with unrolling. An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i % 4])
+///          b[i];
+///      }
+/// 3) Integrate PeelCount. If we know we're peeling once, the globally
+///    uniform expressions don't contribute to the loop size of the remaining
+///    (potentially unrollable) loop.
+/// 4) Use aliasing information to reason about CSEable loads. Consider the
+///    load from 'a' in example #1 with an unroll factor of 4.
+/// 5) Before this gets too complicated, we should consider merging this with
+///    the static unroll cost model.
+class LoopSizeEstimator {
+  Loop &TheLoop;
+  const TargetTransformInfo &TTI;
+  ScalarEvolution &SE;
+  // Note: Both 'size' fields here are in units of
+  // TTI.getUserCost(I, TCK_CodeSize), not instruction counts.
+  unsigned LoopSize;
+  unsigned UniformSize; // Computed lazily; -1U means "not yet computed".
+
+public:
+  LoopSizeEstimator(Loop &L, const TargetTransformInfo &TTI,
+                    ScalarEvolution &SE, unsigned LoopSize)
+      : TheLoop(L), TTI(TTI), SE(SE), LoopSize(LoopSize), UniformSize(-1U) {}
+
+  uint64_t getEstimatedLoopSize(TargetTransformInfo::UnrollingPreferences &UP) {
+    if (-1U == UniformSize)
+      UniformSize = computeUniformInstCost();
+    unsigned LocalUniformSize = UniformSize;
+    // Avoid the degenerate case where the entire loop body is uniform across
+    // all iterations. We should break the backedge, not unroll that.
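+    // (Illustrative numbers, not taken from the patch: with LoopSize = 10,
+    // UP.BEInsns = 2, and UniformSize = 9, the guard below resets
+    // LocalUniformSize to 0; otherwise the unsigned subtraction in the
+    // return expression would wrap.)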
+    if (LocalUniformSize + UP.BEInsns + 1 > LoopSize)
+      LocalUniformSize = 0;
+
+    assert(LoopSize >= (UP.BEInsns + LocalUniformSize) &&
+           "LoopSize should not be less than BEInsns plus UniformSize!");
+    return (uint64_t)(LoopSize - UP.BEInsns - LocalUniformSize) * UP.Count +
+           UP.BEInsns + LocalUniformSize;
+  }
+
+private:
+  // Sum the code-size cost of every instruction whose result is provably
+  // loop invariant; such values only need to be materialized once no matter
+  // how many times the body is unrolled.
+  unsigned computeUniformInstCost() {
+    unsigned Count = 0;
+    for (BasicBlock *BB : TheLoop.blocks())
+      for (Instruction &I : *BB) {
+        if (!SE.isSCEVable(I.getType()))
+          continue;
+        if (!SE.isLoopInvariant(SE.getSCEV(&I), &TheLoop))
+          continue;
+        Count += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
+      }
+    return Count;
+  }
+};
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
@@ -749,6 +809,8 @@
                                TargetTransformInfo::UnrollingPreferences &UP,
                                TargetTransformInfo::PeelingPreferences &PP,
                                bool &UseUpperBound) {
+  LoopSizeEstimator LSE(*L, TTI, SE, LoopSize);
+
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
   bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
@@ -756,7 +818,7 @@
     UP.Count = UnrollCount;
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
-    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+    if (UP.AllowRemainder && LSE.getEstimatedLoopSize(UP) < UP.Threshold)
       return true;
   }
@@ -768,13 +830,13 @@
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
     if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
-        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+        LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return true;
   }
   bool PragmaFullUnroll = hasUnrollFullPragma(L);
   if (PragmaFullUnroll && TripCount != 0) {
     UP.Count = TripCount;
-    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+    if (LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return false;
   }
@@ -823,7 +885,7 @@
   if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
     // When computing the unrolled size, note that BEInsns are not replicated
     // like the rest of the loop body.
-    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+    if (LSE.getEstimatedLoopSize(UP) < UP.Threshold) {
       UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
       TripCount = FullUnrollTripCount;
       TripMultiple = UP.UpperBound ? 1 : TripMultiple;
@@ -870,7 +932,7 @@
     UP.Count = TripCount;
     if (UP.PartialThreshold != NoThreshold) {
       // Reduce unroll count to be modulo of TripCount for partial unrolling.
-      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+      if (LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count =
             (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
             (LoopSize - UP.BEInsns);
@@ -885,7 +947,7 @@
       // remainder loop is allowed.
       UP.Count = UP.DefaultUnrollRuntimeCount;
       while (UP.Count != 0 &&
-             getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+             LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count >>= 1;
     }
     if (UP.Count < 2) {
@@ -969,7 +1031,7 @@
     // Reduce unroll count to be the largest power-of-two factor of
     // the original count which satisfies the threshold limit.
     while (UP.Count != 0 &&
-           getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+           LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
       UP.Count >>= 1;
 
 #ifndef NDEBUG
Index: llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
===================================================================
--- llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
+++ llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -116,7 +116,7 @@
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_2]]
 ; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
 ; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
-; CHECK-NEXT:    [[INC_3]] = add nsw i64 [[INC_2]], 1
+; CHECK-NEXT:    [[INC_3]] = add nuw nsw i64 [[INC_2]], 1
 ; CHECK-NEXT:    br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]]
 ; CHECK:       for.body.3:
 ; CHECK-NEXT:    [[CMP_3:%.*]] = call i1 @foo(i64 [[INC_2]])
@@ -124,7 +124,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !0
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP0:!llvm.loop !.*]]
 ;
 entry:
   br i1 true, label %for.preheader, label %for.end
@@ -202,7 +202,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   %0 = load i32, i32* %A, align 4
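For illustration only (hypothetical source, not part of the patch): the
LoopUnrollAnalyzer change treats loop-invariant computations as free on every
simulated iteration after the first, and LoopSizeEstimator applies the same
idea to partial/runtime unrolling by counting uniform instructions once. A
loop of the form

  for (int i = 0; i < N; i++)
    a[i] = b[i] + n * m;  // 'n * m' is loop invariant

is therefore estimated as computing 'n * m' a single time, rather than once
per unrolled copy of the body.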