Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -114,12 +114,13 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); -/// We don't vectorize loops with a known constant trip count below this number. +/// Loops with a known constant trip count below this number are vectorized only +/// if no scalar iteration overheads are incurred. static cl::opt TinyTripCountVectorThreshold( "vectorizer-min-trip-count", cl::init(16), cl::Hidden, - cl::desc("Don't vectorize loops with a constant " - "trip count that is smaller than this " - "value.")); + cl::desc("Loops with a constant trip count that is smaller than this " + "value are vectorized only if no scalar iteration overheads " + "are incurred.")); static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -7801,8 +7802,25 @@ return false; } - // Check the loop for a trip count threshold: - // do not vectorize loops with a tiny trip count. + PredicatedScalarEvolution PSE(*SE, *L); + + // Check if it is legal to vectorize the loop. + LoopVectorizationRequirements Requirements(*ORE); + LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, + &Requirements, &Hints); + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); + emitMissedWarning(F, L, Hints, ORE); + return false; + } + + // Check the function attributes to find out if this function should be + // optimized for size. + bool OptForSize = + Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); + + // Check the loop for a trip count threshold: vectorize loops with a tiny trip + // count by optimizing for size, to minimize overheads. unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L); bool HasExpectedTC = (ExpectedTC > 0); @@ -7816,36 +7834,19 @@ if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " - << "This loop is not worth vectorizing."); + << "This loop is worth vectorizing only if no scalar " + << "iteration overheads are incurred."); if (Hints.getForce() == LoopVectorizeHints::FK_Enabled) DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { DEBUG(dbgs() << "\n"); - ORE->emit(createMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NotBeneficial", L) - << "vectorization is not beneficial " - "and is not explicitly forced"); - return false; + // Loops with a very small trip count are considered for vectorization + // under OptForSize, thereby making sure the cost of their loop body is + // dominant, free of runtime guards and scalar iteration overheads. + OptForSize = true; } } - PredicatedScalarEvolution PSE(*SE, *L); - - // Check if it is legal to vectorize the loop. - LoopVectorizationRequirements Requirements(*ORE); - LoopVectorizationLegality LVL(L, PSE, DT, TLI, AA, F, TTI, GetLAA, LI, ORE, - &Requirements, &Hints); - if (!LVL.canVectorize()) { - DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n"); - emitMissedWarning(F, L, Hints, ORE); - return false; - } - - // Check the function attributes to find out if this function should be - // optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize(); - // Check the function attributes to see if implicit floats are allowed. // FIXME: This check doesn't seem possibly correct -- what if the loop is // an integer loop and the vector instructions selected are purely integer Index: llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -3,10 +3,11 @@ ; CHECK: LV: Loop hints: force=enabled ; CHECK: LV: Loop hints: force=? +; CHECK: LV: Loop hints: force=? ; No more loops in the module ; CHECK-NOT: LV: Loop hints: force= -; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization -; CHECK: 1 loop-vectorize - Number of loops vectorized +; CHECK: 3 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 2 loop-vectorize - Number of loops vectorized target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -71,3 +72,29 @@ !3 = !{!3} +; +; This loop will be vectorized as the trip count is below the threshold but no +; scalar iterations are needed. +; +define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3 + %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv + %1 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 16 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4 + +for.end: + ret void +} + +!4 = !{!4} + Index: llvm/trunk/test/Transforms/LoopVectorize/small-loop.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/small-loop.ll +++ llvm/trunk/test/Transforms/LoopVectorize/small-loop.ll @@ -7,7 +7,7 @@ @c = common global [2048 x i32] zeroinitializer, align 16 ;CHECK-LABEL: @example1( -;CHECK-NOT: load <4 x i32> +;CHECK: load <4 x i32> ;CHECK: ret void define void @example1() nounwind uwtable ssp { br label %1 @@ -23,8 +23,8 @@ store i32 %6, i32* %7, align 4 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 - %exitcond = icmp eq i32 %lftr.wideiv, 8 ; <----- A really small trip count. - br i1 %exitcond, label %8, label %1 + %exitcond = icmp eq i32 %lftr.wideiv, 8 ; <----- A really small trip count + br i1 %exitcond, label %8, label %1 ; w/o scalar iteration overhead. ;