Index: llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
===================================================================
--- llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -35,6 +35,11 @@
     return true;
   }
 
+  // If we have a loop invariant computation, we only need to compute it once.
+  // Given that, all but the first occurrence are free.
+  if (!IterationNumber->isZero() && SE.isLoopInvariant(S, L))
+    return true;
+
   auto *AR = dyn_cast<SCEVAddRecExpr>(S);
   if (!AR || AR->getLoop() != L)
     return false;
Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -725,13 +725,67 @@
   return MaxPercentThresholdBoost;
 }
 
-// Returns loop size estimation for unrolled loop.
-static uint64_t getUnrolledLoopSize(
-    unsigned LoopSize,
-    TargetTransformInfo::UnrollingPreferences &UP) {
-  assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
-  return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
+/// Helper class to estimate the size of a loop with a given unroll factor.
+/// Note that there's a more expensive estimator used for full unrolling if the
+/// result of this one doesn't show profit, but this is the only estimate used
+/// for partial and runtime unrolling.
+///
+/// TODO: There are a number of ways to make this more sophisticated.
+/// 1) Extend the uniform logic to understand expressions which are uniform for
+///    a given number of iterations (but not all iterations). An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i/4])
+///          b[i]++;
+///      }
+/// 2) Recognize patterns which repeat predictably with unrolling. An example:
+///      for (int i = 0; i < N; i++) {
+///        if (a[i % 4])
+///          b[i]++;
+///      }
+/// 3) Integrate PeelCount. If we know we're peeling once, the globally
+///    uniform expressions don't contribute to the loop size of the remaining
+///    (potentially unrollable) loop.
+/// 4) Use aliasing information to reason about CSEable loads. Consider the
+///    load from 'a' in example #1 with an unroll factor of 4.
+/// 5) Before this gets too complicated, we should consider merging this with
+///    the static unroll cost model.
+class LoopSizeEstimator {
+  Loop &TheLoop;
+  ScalarEvolution &SE;
+  unsigned LoopSize;
+  unsigned UniformSize;
+
+public:
+  LoopSizeEstimator(Loop &L, ScalarEvolution &SE, unsigned LoopSize)
+      : TheLoop(L), SE(SE), LoopSize(LoopSize) {
+    UniformSize = computeUniformInstCount();
+  }
+
+  uint64_t getEstimatedLoopSize(TargetTransformInfo::UnrollingPreferences &UP) {
+    unsigned LocalUniformSize = UniformSize;
+    // Avoid degenerate case where entire loop body is uniform across all
+    // iterations. We should break the backedge, not unroll that.
+    if (LocalUniformSize + UP.BEInsns + 1 > LoopSize)
+      LocalUniformSize = 0;
+
+    assert(LoopSize >= (UP.BEInsns + LocalUniformSize) &&
+           "LoopSize should not be less than BEInsns!");
+    return (uint64_t)(LoopSize - UP.BEInsns - LocalUniformSize) * UP.Count +
+           UP.BEInsns + LocalUniformSize;
+  }
+private:
+  unsigned computeUniformInstCount() {
+    unsigned Count = 0;
+    for (BasicBlock *BB : TheLoop.blocks())
+      for (Instruction &I : *BB) {
+        if (!SE.isSCEVable(I.getType()))
+          continue;
+        if (SE.isLoopInvariant(SE.getSCEV(&I), &TheLoop))
+          Count++;
+      }
+    return Count;
+  }
+};
 
 // Returns true if unroll count was set explicitly.
 // Calculates unroll count and writes it to UP.Count.
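For reference, here is a minimal standalone sketch (not part of the patch) contrasting the old getUnrolledLoopSize() formula with the new estimator's formula. The concrete numbers are assumptions chosen to mirror the partial-uniform.ll test added below: a 12-instruction loop body, 2 backedge instructions, 2 loop-invariant instructions, and an unroll factor of 4, so the estimate drops from 42 to 36.

  // Standalone illustration only; the names mirror the patch but this is not LLVM code.
  #include <cassert>
  #include <cstdint>

  int main() {
    // Assumed values matching the partial-uniform.ll test below.
    unsigned LoopSize = 12, BEInsns = 2, UniformSize = 2, Count = 4;

    // Old model: every non-backedge instruction is replicated Count times.
    uint64_t Old = (uint64_t)(LoopSize - BEInsns) * Count + BEInsns;

    // New model: loop-invariant ("uniform") instructions are charged once,
    // since every copy after the first is expected to be redundant.
    uint64_t New = (uint64_t)(LoopSize - BEInsns - UniformSize) * Count +
                   BEInsns + UniformSize;

    assert(Old == 42 && New == 36);
    return 0;
  }
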
@@ -749,6 +803,8 @@
     TargetTransformInfo::UnrollingPreferences &UP,
     TargetTransformInfo::PeelingPreferences &PP,
     bool &UseUpperBound) {
+  LoopSizeEstimator LSE(*L, SE, LoopSize);
+
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
   bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
@@ -756,7 +812,7 @@
     UP.Count = UnrollCount;
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
-    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+    if (UP.AllowRemainder && LSE.getEstimatedLoopSize(UP) < UP.Threshold)
       return true;
   }
 
@@ -768,13 +824,13 @@
     UP.AllowExpensiveTripCount = true;
     UP.Force = true;
     if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
-        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+        LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return true;
   }
   bool PragmaFullUnroll = hasUnrollFullPragma(L);
   if (PragmaFullUnroll && TripCount != 0) {
     UP.Count = TripCount;
-    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+    if (LSE.getEstimatedLoopSize(UP) < PragmaUnrollThreshold)
       return false;
   }
 
@@ -823,7 +879,7 @@
   if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
     // When computing the unrolled size, note that BEInsns are not replicated
     // like the rest of the loop body.
-    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+    if (LSE.getEstimatedLoopSize(UP) < UP.Threshold) {
       UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
       TripCount = FullUnrollTripCount;
       TripMultiple = UP.UpperBound ? 1 : TripMultiple;
@@ -870,7 +926,7 @@
     UP.Count = TripCount;
     if (UP.PartialThreshold != NoThreshold) {
       // Reduce unroll count to be modulo of TripCount for partial unrolling.
-      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+      if (LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count =
             (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
             (LoopSize - UP.BEInsns);
@@ -885,7 +941,7 @@
       // remainder loop is allowed.
       UP.Count = UP.DefaultUnrollRuntimeCount;
       while (UP.Count != 0 &&
-             getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+             LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
         UP.Count >>= 1;
     }
     if (UP.Count < 2) {
@@ -969,7 +1025,7 @@
     // Reduce unroll count to be the largest power-of-two factor of
     // the original count which satisfies the threshold limit.
     while (UP.Count != 0 &&
-           getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+           LSE.getEstimatedLoopSize(UP) > UP.PartialThreshold)
       UP.Count >>= 1;
 
 #ifndef NDEBUG
Index: llvm/test/Transforms/LoopUnroll/X86/partial-uniform.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopUnroll/X86/partial-uniform.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -unroll-partial-threshold=36 -unroll-threshold=0 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=0 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+;; Two instructions are loop invariant and thus don't count against
+;; the unroll cost for iteration > 1. As a result, we can unroll this
+;; at cost 36 not 42.
+define i32 @uniform_contents(i32* %A, i32* %B, i32 %c, i32 %d) {
+; CHECK-LABEL: @uniform_contents(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT_3:%.*]], [[LATCH_3:%.*]] ]
+; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ACCUM_NEXT_3:%.*]], [[LATCH_3]] ]
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT:    [[V1:%.*]] = load i32, i32* [[GEP1]], align 4
+; CHECK-NEXT:    [[EARLYEXIT:%.*]] = icmp eq i32 [[V1]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT]], label [[LOOP_EXIT:%.*]], label [[LATCH:%.*]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[INDEX:%.*]] = udiv i32 [[C:%.*]], [[D:%.*]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, i32* [[GEP2]], align 4
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT:%.*]] = add i32 [[ACCUM]], [[V2]]
+; CHECK-NEXT:    [[GEP1_1:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT]]
+; CHECK-NEXT:    [[V1_1:%.*]] = load i32, i32* [[GEP1_1]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_1:%.*]] = icmp eq i32 [[V1_1]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_1]], label [[LOOP_EXIT]], label [[LATCH_1:%.*]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[ACCUM_LCSSA:%.*]] = phi i32 [ [[ACCUM]], [[LOOP]] ], [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[ACCUM_NEXT_1:%.*]], [[LATCH_1]] ], [ [[ACCUM_NEXT_1]], [[LATCH_2:%.*]] ], [ [[ACCUM_NEXT_2:%.*]], [[LOOP_3:%.*]] ]
+; CHECK-NEXT:    ret i32 [[ACCUM_LCSSA]]
+; CHECK:       latch.1:
+; CHECK-NEXT:    [[INDEX_1:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_1:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_1]]
+; CHECK-NEXT:    [[V2_1:%.*]] = load i32, i32* [[GEP2_1]], align 4
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_NEXT]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_1]] = add i32 [[ACCUM_NEXT]], [[V2_1]]
+; CHECK-NEXT:    [[GEP1_2:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[V1_2:%.*]] = load i32, i32* [[GEP1_2]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_2:%.*]] = icmp eq i32 [[V1_2]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_2]], label [[LOOP_EXIT]], label [[LATCH_2]]
+; CHECK:       latch.2:
+; CHECK-NEXT:    [[INDEX_2:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_2:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_2]]
+; CHECK-NEXT:    [[V2_2:%.*]] = load i32, i32* [[GEP2_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_NEXT_1]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_2]] = add i32 [[ACCUM_NEXT_1]], [[V2_2]]
+; CHECK-NEXT:    [[EXIT_2:%.*]] = icmp eq i64 [[IV_NEXT_1]], 198
+; CHECK-NEXT:    br i1 [[EXIT_2]], label [[LOOP_EXIT]], label [[LOOP_3]]
+; CHECK:       loop.3:
+; CHECK-NEXT:    [[GEP1_3:%.*]] = getelementptr i32, i32* [[A]], i64 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[V1_3:%.*]] = load i32, i32* [[GEP1_3]], align 4
+; CHECK-NEXT:    [[EARLYEXIT_3:%.*]] = icmp eq i32 [[V1_3]], 0
+; CHECK-NEXT:    br i1 [[EARLYEXIT_3]], label [[LOOP_EXIT]], label [[LATCH_3]]
+; CHECK:       latch.3:
+; CHECK-NEXT:    [[INDEX_3:%.*]] = udiv i32 [[C]], [[D]]
+; CHECK-NEXT:    [[GEP2_3:%.*]] = getelementptr i32, i32* [[A]], i32 [[INDEX_3]]
+; CHECK-NEXT:    [[V2_3:%.*]] = load i32, i32* [[GEP2_3]], align 4
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV_NEXT_2]], 1
+; CHECK-NEXT:    [[ACCUM_NEXT_3]] = add i32 [[ACCUM_NEXT_2]], [[V2_3]]
+; CHECK-NEXT:    br label [[LOOP]]
+;
+entry:
+  br label %loop
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %accum = phi i32 [ 0, %entry ], [ %accum.next, %latch ]
+  %gep1 = getelementptr i32, i32* %A, i64 %iv
+  %v1 = load i32, i32* %gep1
+  %earlyexit = icmp eq i32 %v1, 0
+  br i1 %earlyexit, label %loop_exit, label %latch
+latch:
+  %index = udiv i32 %c, %d
+  %gep2 = getelementptr i32, i32* %A, i32 %index
+  %v2 = load i32, i32* %gep2
+  %iv.next = add i64 %iv, 1
+  %accum.next = add i32 %accum, %v2
+  %exit = icmp eq i64 %iv, 198
+  br i1 %exit, label %loop_exit, label %loop
+
+loop_exit:
+  ret i32 %accum
+}
+
Index: llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
===================================================================
--- llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
+++ llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll
@@ -116,7 +116,7 @@
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_2]]
 ; CHECK-NEXT:    [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4
 ; CHECK-NEXT:    call void @bar(i32 [[DOTPRE_2]])
-; CHECK-NEXT:    [[INC_3]] = add nsw i64 [[INC_2]], 1
+; CHECK-NEXT:    [[INC_3]] = add nuw nsw i64 [[INC_2]], 1
 ; CHECK-NEXT:    br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]]
 ; CHECK:       for.body.3:
 ; CHECK-NEXT:    [[CMP_3:%.*]] = call i1 @foo(i64 [[INC_2]])
@@ -124,7 +124,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !0
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP0:!llvm.loop !.*]]
 ;
 entry:
   br i1 true, label %for.preheader, label %for.end
@@ -202,7 +202,7 @@
 ; CHECK:       for.body.for.body_crit_edge.3:
 ; CHECK-NEXT:    [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]]
 ; CHECK-NEXT:    [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4
-; CHECK-NEXT:    br label [[FOR_HEADER]], !llvm.loop !2
+; CHECK-NEXT:    br label [[FOR_HEADER]], [[LOOP2:!llvm.loop !.*]]
 ;
 entry:
   %0 = load i32, i32* %A, align 4