Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -243,13 +243,11 @@ /// profitable. Set this to UINT_MAX to disable the loop body cost /// restriction. unsigned Threshold; - /// If complete unrolling will reduce the cost of the loop below its - /// expected dynamic cost while rolled by this percentage, apply a discount - /// (below) to its unrolled cost. - unsigned PercentDynamicCostSavedThreshold; - /// The discount applied to the unrolled cost when the *dynamic* cost - /// savings of unrolling exceed the \c PercentDynamicCostSavedThreshold. - unsigned DynamicCostSavingsDiscount; + /// If complete unrolling will reduce the cost of the loop, we will boost + /// the Threshold by a certain percent to allow more aggressive complete + /// unrolling. This value provides the maximum boost percentage that we + /// can apply to Threshold (The value should be no less than 100). + unsigned PercentMaxThresholdBoost; /// The cost threshold for the unrolled loop when optimizing for size (set /// to UINT_MAX to disable). unsigned OptSizeThreshold; Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -46,16 +46,10 @@ UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The baseline cost threshold for loop unrolling")); -static cl::opt UnrollPercentDynamicCostSavedThreshold( - "unroll-percent-dynamic-cost-saved-threshold", cl::init(50), cl::Hidden, - cl::desc("The percentage of estimated dynamic cost which must be saved by " - "unrolling to allow unrolling up to the max threshold.")); - -static cl::opt UnrollDynamicCostSavingsDiscount( - "unroll-dynamic-cost-savings-discount", cl::init(100), cl::Hidden, - cl::desc("This is the amount discounted from the total unroll cost when " - "the unrolled form has a high dynamic cost savings (triggered by " - "the '-unroll-perecent-dynamic-cost-saved-threshold' flag).")); +static cl::opt PercentMaxThresholdBoost( + "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, + cl::desc("Boost the threshold for complete unroll by a max percent to " + "allow aggressive unrolling to reduce run time")); static cl::opt UnrollMaxIterationsCountToAnalyze( "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden, @@ -127,8 +121,7 @@ // Set up the defaults UP.Threshold = 150; - UP.PercentDynamicCostSavedThreshold = 50; - UP.DynamicCostSavingsDiscount = 100; + UP.PercentMaxThresholdBoost = 400; UP.OptSizeThreshold = 0; UP.PartialThreshold = UP.Threshold; UP.PartialOptSizeThreshold = 0; @@ -160,11 +153,8 @@ UP.Threshold = UnrollThreshold; UP.PartialThreshold = UnrollThreshold; } - if (UnrollPercentDynamicCostSavedThreshold.getNumOccurrences() > 0) - UP.PercentDynamicCostSavedThreshold = - UnrollPercentDynamicCostSavedThreshold; - if (UnrollDynamicCostSavingsDiscount.getNumOccurrences() > 0) - UP.DynamicCostSavingsDiscount = UnrollDynamicCostSavingsDiscount; + if (PercentMaxThresholdBoost.getNumOccurrences() > 0) + UP.PercentMaxThresholdBoost = PercentMaxThresholdBoost; if (UnrollMaxCount.getNumOccurrences() > 0) UP.MaxCount = UnrollMaxCount; if (UnrollFullMaxCount.getNumOccurrences() > 0) @@ -662,58 +652,6 @@ L->setLoopID(NewLoopID); } -static bool canUnrollCompletely(Loop *L, unsigned Threshold, - unsigned PercentDynamicCostSavedThreshold, - unsigned DynamicCostSavingsDiscount, - uint64_t UnrolledCost, - uint64_t RolledDynamicCost) { - if (Threshold == NoThreshold) { - DEBUG(dbgs() << " Can fully unroll, because no threshold is set.\n"); - return true; - } - - if (UnrolledCost <= Threshold) { - DEBUG(dbgs() << " Can fully unroll, because unrolled cost: " - << UnrolledCost << "<=" << Threshold << "\n"); - return true; - } - - assert(UnrolledCost && "UnrolledCost can't be 0 at this point."); - assert(RolledDynamicCost >= UnrolledCost && - "Cannot have a higher unrolled cost than a rolled cost!"); - - // Compute the percentage of the dynamic cost in the rolled form that is - // saved when unrolled. If unrolling dramatically reduces the estimated - // dynamic cost of the loop, we use a higher threshold to allow more - // unrolling. - unsigned PercentDynamicCostSaved = - (uint64_t)(RolledDynamicCost - UnrolledCost) * 100ull / RolledDynamicCost; - - if (PercentDynamicCostSaved >= PercentDynamicCostSavedThreshold && - (int64_t)UnrolledCost - (int64_t)DynamicCostSavingsDiscount <= - (int64_t)Threshold) { - DEBUG(dbgs() << " Can fully unroll, because unrolling will reduce the " - "expected dynamic cost by " - << PercentDynamicCostSaved << "% (threshold: " - << PercentDynamicCostSavedThreshold << "%)\n" - << " and the unrolled cost (" << UnrolledCost - << ") is less than the max threshold (" - << DynamicCostSavingsDiscount << ").\n"); - return true; - } - - DEBUG(dbgs() << " Too large to fully unroll:\n"); - DEBUG(dbgs() << " Threshold: " << Threshold << "\n"); - DEBUG(dbgs() << " Max threshold: " << DynamicCostSavingsDiscount << "\n"); - DEBUG(dbgs() << " Percent cost saved threshold: " - << PercentDynamicCostSavedThreshold << "%\n"); - DEBUG(dbgs() << " Unrolled cost: " << UnrolledCost << "\n"); - DEBUG(dbgs() << " Rolled dynamic cost: " << RolledDynamicCost << "\n"); - DEBUG(dbgs() << " Percent cost saved: " << PercentDynamicCostSaved - << "\n"); - return false; -} - // Returns loop size estimation for unrolled loop. static uint64_t getUnrolledLoopSize( unsigned LoopSize, @@ -787,9 +725,7 @@ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. - if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, - getUnrolledLoopSize(LoopSize, UP), - getUnrolledLoopSize(LoopSize, UP))) { + if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) { UseUpperBound = (MaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 1 : TripMultiple; @@ -800,16 +736,21 @@ // To check that, run additional analysis on the loop. if (Optional Cost = analyzeLoopUnrollCost( L, FullUnrollTripCount, DT, *SE, TTI, - UP.Threshold + UP.DynamicCostSavingsDiscount)) - if (canUnrollCompletely(L, UP.Threshold, - UP.PercentDynamicCostSavedThreshold, - UP.DynamicCostSavingsDiscount, - Cost->UnrolledCost, Cost->RolledDynamicCost)) { + UP.Threshold * UP.PercentMaxThresholdBoost / 100)) { + unsigned Benefit = + Cost->RolledDynamicCost * 100 >= + Cost->UnrolledCost * UP.PercentMaxThresholdBoost + ? UP.PercentMaxThresholdBoost + : Cost->RolledDynamicCost * 100 / Cost->UnrolledCost; + unsigned ThresholdBoost = std::min(Benefit * Benefit / 10000, + UP.PercentMaxThresholdBoost / 100); + if (Cost->UnrolledCost < UP.Threshold * ThresholdBoost) { UseUpperBound = (MaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 1 : TripMultiple; return ExplicitUnroll; } + } } } Index: test/Transforms/LoopUnroll/full-unroll-crashers.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-crashers.ll +++ test/Transforms/LoopUnroll/full-unroll-crashers.ll @@ -1,5 +1,5 @@ ; Check that we don't crash on corner cases. -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -unroll-percent-dynamic-cost-saved-threshold=20 -o /dev/null +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=1 -o /dev/null target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=70 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-cmp.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=40 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-dce.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @known_constant = internal unnamed_addr constant [10 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=60 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; When examining gep-instructions we shouldn't consider them simplified if the Index: test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-phi-prop.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-dynamic-cost-savings-discount=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=50 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define i64 @propagate_loop_phis() { Index: test/Transforms/LoopUnroll/full-unroll-heuristics.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics.ll @@ -17,21 +17,18 @@ ; optimizations to remove ~55% of the instructions, the loop body size is 9, ; and unrolled size is 65. -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=20 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s -check-prefix=TEST1 -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=20 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s -check-prefix=TEST2 -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-percent-dynamic-cost-saved-threshold=80 -unroll-dynamic-cost-savings-discount=90 | FileCheck %s -check-prefix=TEST3 -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=100 -unroll-percent-dynamic-cost-saved-threshold=80 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s -check-prefix=TEST4 +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=5 | FileCheck %s -check-prefix=TEST1 +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 | FileCheck %s -check-prefix=TEST2 +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST3 -; If the absolute threshold is too low, or if we can't optimize away requested -; percent of instructions, we shouldn't unroll: +; If the absolute threshold is too low, we should not unroll: ; TEST1: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv -; TEST3: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv ; Otherwise, we should: ; TEST2-NOT: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv -; Also, we should unroll if the 'unroll-threshold' is big enough: -; TEST4-NOT: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv +; If we do not boost threshold, the unroll will not happen: +; TEST3: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv ; And check that we don't crash when we're not allowed to do any analysis. ; RUN: opt < %s -loop-unroll -unroll-max-iteration-count-to-analyze=0 -disable-output Index: test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll =================================================================== --- test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll +++ test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -unroll-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-dynamic-cost-savings-discount=0 | FileCheck %s +; RUN: opt < %s -S -unroll-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder | FileCheck %s ; The Loop TripCount is 9. However unroll factors 3 or 9 exceed given threshold. ; The test checks that we choose a smaller, power-of-two, unroll count and do not give up on unrolling. @@ -21,7 +21,7 @@ %st = trunc i64 %indvars.iv to i32 store i32 %st, i32* %arrayidx2, align 4 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, 10 + %exitcond = icmp eq i64 %indvars.iv.next, 20 br i1 %exitcond, label %for.end, label %for.body for.end: ; preds = %for.body Index: test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=40 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=40 | FileCheck %s @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16