Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -739,17 +739,55 @@ // cheaply estimate cost for full unrolling when we don't want to symbolically // evaluate all iterations. class UnrollCostEstimator { + Loop &TheLoop; + const TargetTransformInfo &TTI; + ScalarEvolution &SE; + // Note: Both "size" fields here are in units of TTI->getUserCost(, CodeSize), + // not instruction counts. const unsigned LoopSize; + Optional UniformSize; public: - UnrollCostEstimator(Loop &L, unsigned LoopSize) : LoopSize(LoopSize) {} + UnrollCostEstimator(Loop &L, const TargetTransformInfo &TTI, + ScalarEvolution &SE, unsigned LoopSize) + : TheLoop(L), TTI(TTI), SE(SE), LoopSize(LoopSize) {} + + + bool + isUnrolledLoopUnderThreshold(TargetTransformInfo::UnrollingPreferences &UP, + unsigned Threshold) { + // Check the cheap form first before falling back to the expensive + // SCEV queries. Most loops are small, so the expensive path should + // rarely run. + if (getUnrolledLoopSize(UP, false) < Threshold) + return true; + return getUnrolledLoopSize(UP, true) < Threshold; + } + +private: // Returns loop size estimation for unrolled loop, given the unrolling // configuration specified by UP. - uint64_t getUnrolledLoopSize(TargetTransformInfo::UnrollingPreferences &UP) { - assert(LoopSize >= UP.BEInsns && + uint64_t getUnrolledLoopSize(TargetTransformInfo::UnrollingPreferences &UP, + bool UseUniform) { + unsigned UniformSize = UseUniform ? computeUniformInstCost() : 0; + assert(LoopSize >= (UP.BEInsns + UniformSize) && "LoopSize should not be less than BEInsns!"); - return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns; + return (uint64_t)(LoopSize - UP.BEInsns - UniformSize) * UP.Count + + UP.BEInsns + UniformSize; + } + + unsigned computeUniformInstCost() { + if (!UniformSize) { + InstructionCost Size = 0; + for (BasicBlock *BB : TheLoop.blocks()) + for (Instruction &I : *BB) + if (SE.isSCEVable(I.getType()) && + SE.isLoopInvariant(SE.getSCEV(&I), &TheLoop)) + Size += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); + UniformSize = *Size.getValue(); + } + return *UniformSize; } }; @@ -769,7 +807,7 @@ TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) { - UnrollCostEstimator UCE(*L, LoopSize); + UnrollCostEstimator UCE(*L, TTI, SE, LoopSize); // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. @@ -778,7 +816,7 @@ UP.Count = UnrollCount; UP.AllowExpensiveTripCount = true; UP.Force = true; - if (UP.AllowRemainder && UCE.getUnrolledLoopSize(UP) < UP.Threshold) + if (UP.AllowRemainder && UCE.isUnrolledLoopUnderThreshold(UP, UP.Threshold)) return true; } @@ -790,13 +828,13 @@ UP.AllowExpensiveTripCount = true; UP.Force = true; if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) && - UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold) + UCE.isUnrolledLoopUnderThreshold(UP, PragmaUnrollThreshold)) return true; } bool PragmaFullUnroll = hasUnrollFullPragma(L); if (PragmaFullUnroll && TripCount != 0) { UP.Count = TripCount; - if (UCE.getUnrolledLoopSize(UP) < PragmaUnrollThreshold) + if (UCE.isUnrolledLoopUnderThreshold(UP, PragmaUnrollThreshold)) return false; } @@ -845,7 +883,7 @@ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. - if (UCE.getUnrolledLoopSize(UP) < UP.Threshold) { + if (UCE.isUnrolledLoopUnderThreshold(UP, UP.Threshold)) { UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 1 : TripMultiple; @@ -892,7 +930,7 @@ UP.Count = TripCount; if (UP.PartialThreshold != NoThreshold) { // Reduce unroll count to be modulo of TripCount for partial unrolling. - if (UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold) + if (!UCE.isUnrolledLoopUnderThreshold(UP, UP.PartialThreshold)) UP.Count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / (LoopSize - UP.BEInsns); @@ -907,7 +945,7 @@ // remainder loop is allowed. UP.Count = UP.DefaultUnrollRuntimeCount; while (UP.Count != 0 && - UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold) + !UCE.isUnrolledLoopUnderThreshold(UP, UP.PartialThreshold)) UP.Count >>= 1; } if (UP.Count < 2) { @@ -991,7 +1029,7 @@ // Reduce unroll count to be the largest power-of-two factor of // the original count which satisfies the threshold limit. while (UP.Count != 0 && - UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold) + !UCE.isUnrolledLoopUnderThreshold(UP, UP.PartialThreshold)) UP.Count >>= 1; #ifndef NDEBUG Index: llvm/test/Transforms/LoopUnroll/full-unroll-invariant-2.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopUnroll/full-unroll-invariant-2.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -loop-unroll -unroll-threshold=104 | FileCheck %s +; RUN: opt < %s -S -passes='require,loop(loop-unroll-full)' -unroll-threshold=104 | FileCheck %s + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" + +; Check that the course (not symbolic execution based) unroll cost model +; discounts invariant instructions. This test file is split from +; full-unroll-invariant.ll because we need a different threshold value due +; to an unrelated cost model problem. The loop below should have a cost of +; 1, but the handling of BEInsts results in it having a cost of 103. + +define i32 @test4(i8 %a) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[ZEXT_99:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: ret i32 [[ZEXT_99]] +; +entry: + br label %for.body + +for.body: + %phi = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %zext = zext i8 %a to i32 + %inc = add nuw nsw i64 %phi, 1 + %cmp = icmp ult i64 %inc, 100 + br i1 %cmp, label %for.body, label %for.exit + +for.exit: + ret i32 %zext +}