Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -243,6 +243,12 @@ /// profitable. Set this to UINT_MAX to disable the loop body cost /// restriction. unsigned Threshold; + /// The cost threshold for the fully unrolled loop. + /// We use a different threshold than for dynamic/partial unrolling because + /// full unrolling will not cause frontend issues such as the loop no longer + /// fitting in the loop stream detector. As a result, full unrolling is + /// considered to always improve performance, and this threshold should be + /// higher than the dynamic/partial unroll threshold. + unsigned FullThreshold; /// If complete unrolling will reduce the cost of the loop, we will boost /// the Threshold by a certain percent to allow more aggressive complete /// unrolling. This value provides the maximum boost percentage that we Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -46,6 +46,10 @@ UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The baseline cost threshold for loop unrolling")); +static cl::opt UnrollFullThreshold( + "unroll-full-threshold", cl::Hidden, + cl::desc("The cost threshold for fully loop unrolling")); + static cl::opt UnrollMaxPercentThresholdBoost( "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied " @@ -125,6 +129,7 @@ // Set up the defaults UP.Threshold = 150; + UP.FullThreshold = 300; UP.MaxPercentThresholdBoost = 400; UP.OptSizeThreshold = 0; UP.PartialThreshold = UP.Threshold; @@ -157,6 +162,9 @@ UP.Threshold = UnrollThreshold; UP.PartialThreshold = UnrollThreshold; } + if (UnrollFullThreshold.getNumOccurrences() > 0) { + UP.FullThreshold = UnrollFullThreshold; + } if 
(UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0) UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost; if (UnrollMaxCount.getNumOccurrences() > 0) @@ -746,7 +754,7 @@ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. - if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) { + if (getUnrolledLoopSize(LoopSize, UP) < UP.FullThreshold) { UseUpperBound = (MaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 1 : TripMultiple; @@ -757,10 +765,10 @@ // To check that, run additional analysis on the loop. if (Optional Cost = analyzeLoopUnrollCost( L, FullUnrollTripCount, DT, *SE, TTI, - UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { + UP.FullThreshold * UP.MaxPercentThresholdBoost / 100)) { unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); - if (Cost->UnrolledCost < UP.Threshold * Boost / 100) { + if (Cost->UnrolledCost < UP.FullThreshold * Boost / 100) { UseUpperBound = (MaxTripCount == FullUnrollTripCount); TripCount = FullUnrollTripCount; TripMultiple = UP.UpperBound ? 
1 : TripMultiple; Index: test/Transforms/BBVectorize/loop1.ll =================================================================== --- test/Transforms/BBVectorize/loop1.ll +++ test/Transforms/BBVectorize/loop1.ll @@ -1,7 +1,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -; RUN: opt < %s -dont-improve-non-negative-phi-bits=false -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL +; RUN: opt < %s -dont-improve-non-negative-phi-bits=false -basicaa -loop-unroll -unroll-threshold=45 -unroll-full-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL ; The second check covers the use of alias analysis (with loop unrolling). 
define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable { Index: test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-2.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-full-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @unknown_global = internal unnamed_addr global [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 Index: test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics-geps.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=100 -unroll-full-threshold=10 -unroll-max-percent-threshold-boost=200 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; When examining gep-instructions we shouldn't consider them simplified if the Index: test/Transforms/LoopUnroll/full-unroll-heuristics.ll =================================================================== --- test/Transforms/LoopUnroll/full-unroll-heuristics.ll +++ test/Transforms/LoopUnroll/full-unroll-heuristics.ll @@ -1,13 +1,13 @@ ; In this test we check how heuristics for complete unrolling work. 
We have ; three knobs: -; 1) -unroll-threshold +; 1) -unroll-full-threshold ; 3) -unroll-percent-dynamic-cost-saved-threshold and ; 2) -unroll-dynamic-cost-savings-discount ; ; They control loop-unrolling according to the following rules: ; * If size of unrolled loop exceeds the absoulte threshold, we don't unroll ; this loop under any circumstances. -; * If size of unrolled loop is below the '-unroll-threshold', then we'll +; * If size of unrolled loop is below the '-unroll-full-threshold', then we'll ; consider this loop as a very small one, and completely unroll it. ; * If a loop size is between these two tresholds, we only do complete unroll ; it if estimated number of potentially optimized instructions is high (we @@ -17,9 +17,9 @@ ; optimizations to remove ~55% of the instructions, the loop body size is 9, ; and unrolled size is 65. -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=10 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST1 -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=200 | FileCheck %s -check-prefix=TEST2 -; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-threshold=20 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST3 +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-full-threshold=10 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST1 +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-full-threshold=20 -unroll-max-percent-threshold-boost=200 | FileCheck %s -check-prefix=TEST2 +; RUN: opt < %s -S -loop-unroll -unroll-max-iteration-count-to-analyze=1000 -unroll-full-threshold=20 -unroll-max-percent-threshold-boost=100 | FileCheck %s -check-prefix=TEST3 ; If the absolute threshold is too low, we should not unroll: ; TEST1: %array_const_idx = 
getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv Index: test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll =================================================================== --- test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll +++ test/Transforms/LoopUnroll/partial-unroll-const-bounds.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -S -unroll-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=100 | FileCheck %s +; RUN: opt < %s -S -unroll-threshold=20 -unroll-full-threshold=20 -loop-unroll -unroll-allow-partial -unroll-runtime -unroll-allow-remainder -unroll-max-percent-threshold-boost=100 | FileCheck %s ; The Loop TripCount is 9. However unroll factors 3 or 9 exceed given threshold. ; The test checks that we choose a smaller, power-of-two, unroll count and do not give up on unrolling. Index: test/Transforms/LoopUnroll/unroll-pragmas.ll =================================================================== --- test/Transforms/LoopUnroll/unroll-pragmas.ll +++ test/Transforms/LoopUnroll/unroll-pragmas.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck %s -; RUN: opt < %s -loop-unroll -loop-unroll -pragma-unroll-threshold=1024 -S | FileCheck %s +; RUN: opt < %s -loop-unroll -unroll-full-threshold=150 -pragma-unroll-threshold=1024 -S | FileCheck %s +; RUN: opt < %s -loop-unroll -loop-unroll -unroll-full-threshold=150 -pragma-unroll-threshold=1024 -S | FileCheck %s ; ; Run loop unrolling twice to verify that loop unrolling metadata is properly ; removed and further unrolling is disabled after the pass is run once. 
Index: test/Transforms/LoopVectorize/X86/metadata-enable.ll =================================================================== --- test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -1,13 +1,13 @@ -; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1 -; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2 -; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3 -; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os -; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz -; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC -; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC -; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2 -; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2 -; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1 +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2 +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3 +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -O1 -vectorize-loops -S 
-unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2 +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2 +; RUN: opt < %s -mcpu=corei7 -unroll-full-threshold=150 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS ; This file tests the llvm.loop.vectorize.enable metadata forcing ; vectorization even when optimization levels are too low, or when