Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -737,17 +737,41 @@ // cheaply estimate cost for full unrolling when we don't want to symbolically // evaluate all iterations. class UnrollCostEstimator { + Loop &TheLoop; + const TargetTransformInfo &TTI; + ScalarEvolution &SE; + // Note: Both "size" fields here are in units of TTI.getUserCost(&I, TCK_CodeSize), + // not instruction counts. const unsigned LoopSize; + Optional<unsigned> UniformSize; public: - UnrollCostEstimator(Loop &L, unsigned LoopSize) : LoopSize(LoopSize) {} + UnrollCostEstimator(Loop &L, const TargetTransformInfo &TTI, + ScalarEvolution &SE, unsigned LoopSize) + : TheLoop(L), TTI(TTI), SE(SE), LoopSize(LoopSize) {} // Returns loop size estimation for unrolled loop, given the unrolling // configuration specified by UP. uint64_t getUnrolledLoopSize(TargetTransformInfo::UnrollingPreferences &UP) { - assert(LoopSize >= UP.BEInsns && + unsigned UniformSize = computeUniformInstCost(); + assert(LoopSize >= (UP.BEInsns + UniformSize) && "LoopSize should not be less than BEInsns!"); - return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns; + return (uint64_t)(LoopSize - UP.BEInsns - UniformSize) * UP.Count + + UP.BEInsns + UniformSize; + } + +private: + unsigned computeUniformInstCost() { + if (!UniformSize) { + InstructionCost Size = 0; + for (BasicBlock *BB : TheLoop.blocks()) + for (Instruction &I : *BB) + if (SE.isSCEVable(I.getType()) && + SE.isLoopInvariant(SE.getSCEV(&I), &TheLoop)) + Size += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); + UniformSize = *Size.getValue(); + } + return *UniformSize; } }; @@ -767,7 +791,7 @@ TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { - UnrollCostEstimator UCE(*L, LoopSize); + UnrollCostEstimator UCE(*L, TTI, SE, 
LoopSize); // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. Index: llvm/test/Transforms/LoopUnroll/full-unroll-invariant-2.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopUnroll/full-unroll-invariant-2.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -loop-unroll -unroll-threshold=104 | FileCheck %s +; RUN: opt < %s -S -passes='require<opt-remark-emit>,loop(loop-unroll-full)' -unroll-threshold=104 | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Check that the coarse (not symbolic execution based) unroll cost model +; discounts invariant instructions. This test file is split from +; full-unroll-invariant.ll because we need a different threshold value due +; to an unrelated cost model problem. The loop below should have a cost of +; 1, but the handling of BEInsns results in it having a cost of 103. + +define i32 @test4(i8 %a) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ZEXT_99:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[ZEXT_99]] +; +entry: + br label %for.body + +for.body: + %phi = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %zext = zext i8 %a to i32 + %inc = add nuw nsw i64 %phi, 1 + %cmp = icmp ult i64 %inc, 100 + br i1 %cmp, label %for.body, label %for.exit + +for.exit: + ret i32 %zext +} Index: llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll +++ llvm/test/Transforms/LoopUnroll/nonlatchcondbr.ll @@ -116,7 +116,7 @@ ; CHECK-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_2]] ; CHECK-NEXT: [[DOTPRE_2:%.*]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_2]], align 4 ; CHECK-NEXT: call void @bar(i32 [[DOTPRE_2]]) -; 
CHECK-NEXT: [[INC_3]] = add nsw i64 [[INC_2]], 1 +; CHECK-NEXT: [[INC_3]] = add nuw nsw i64 [[INC_2]], 1 ; CHECK-NEXT: br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE_3]] ; CHECK: for.body.3: ; CHECK-NEXT: [[CMP_3:%.*]] = call i1 @foo(i64 [[INC_2]]) @@ -124,7 +124,7 @@ ; CHECK: for.body.for.body_crit_edge.3: ; CHECK-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]] ; CHECK-NEXT: [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4 -; CHECK-NEXT: br label [[FOR_HEADER]], !llvm.loop !0 +; CHECK-NEXT: br label [[FOR_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] ; entry: br i1 true, label %for.preheader, label %for.end @@ -202,7 +202,7 @@ ; CHECK: for.body.for.body_crit_edge.3: ; CHECK-NEXT: [[ARRAYIDX_PHI_TRANS_INSERT_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INC_3]] ; CHECK-NEXT: [[DOTPRE_3]] = load i32, i32* [[ARRAYIDX_PHI_TRANS_INSERT_3]], align 4 -; CHECK-NEXT: br label [[FOR_HEADER]], !llvm.loop !2 +; CHECK-NEXT: br label [[FOR_HEADER]], !llvm.loop [[LOOP2:![0-9]+]] ; entry: %0 = load i32, i32* %A, align 4