Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -818,7 +818,8 @@ /// then this vectorization factor will be selected if vectorization is /// possible. VectorizationFactor selectVectorizationFactor(bool OptForSize, - unsigned UserVF); + unsigned UserVF, + bool ForceVectorization); /// \return The size (in bits) of the widest type in the code that /// needs to be vectorized. We ignore values that remain scalar such as @@ -890,13 +891,17 @@ unsigned Width; /// Vectorization unroll factor. unsigned Unroll; - /// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled) - int Force; + /// Vectorization forced + enum ForceKind { + FK_Undefined = -1, ///< Not selected. + FK_Disabled = 0, ///< Forcing disabled. + FK_Enabled = 1, ///< Forcing enabled. + } Force; LoopVectorizeHints(const Loop *L, bool DisableUnrolling) : Width(VectorizationFactor) , Unroll(DisableUnrolling ? 1 : VectorizationUnroll) - , Force(-1) + , Force(FK_Undefined) , LoopID(L->getLoopID()) { getHints(L); // The command line options override any loop metadata except for when @@ -1009,7 +1014,8 @@ DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n"); } else if (Hint == "enable") { if (C->getBitWidth() == 1) - Force = Val; + Force = Val == 1 ? LoopVectorizeHints::FK_Enabled + : LoopVectorizeHints::FK_Disabled; else DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n"); } else { @@ -1105,18 +1111,20 @@ LoopVectorizeHints Hints(L, DisableUnrolling); DEBUG(dbgs() << "LV: Loop hints:" - << " force=" << (Hints.Force == 0 - ? "disabled" - : (Hints.Force == 1 ? "enabled" : "?")) - << " width=" << Hints.Width << " unroll=" << Hints.Unroll - << "\n"); - - if (Hints.Force == 0) { + << " force=" + << (Hints.Force == LoopVectorizeHints::FK_Disabled + ? "disabled" + : (Hints.Force == LoopVectorizeHints::FK_Enabled + ? 
"enabled" + : "?")) << " width=" << Hints.Width + << " unroll=" << Hints.Unroll << "\n"); + + if (Hints.Force == LoopVectorizeHints::FK_Disabled) { DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n"); return false; } - if (!AlwaysVectorize && Hints.Force != 1) { + if (!AlwaysVectorize && Hints.Force != LoopVectorizeHints::FK_Enabled) { DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n"); return false; } @@ -1126,6 +1134,23 @@ return false; } + // Check the loop for a trip count threshold: + // do not vectorize loops with a tiny trip count. + { + BasicBlock *Latch = L->getLoopLatch(); + const unsigned TC = SE->getSmallConstantTripCount(L, Latch); + if (TC > 0u && TC < TinyTripCountVectorThreshold) { + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " + << "This loop is not worth vectorizing."); + if (Hints.Force == LoopVectorizeHints::FK_Enabled) + DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); + else { + DEBUG(dbgs() << "\n"); + return false; + } + } + } + // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL, DT, TLI); if (!LVL.canVectorize()) { @@ -1139,8 +1164,8 @@ // Check the function attributes to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); - bool OptForSize = - Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = Hints.Force != LoopVectorizeHints::FK_Enabled && + F->hasFnAttribute(Attribute::OptimizeForSize); // Compute the weighted frequency of this loop being executed and see if it // is less than 20% of the function entry baseline frequency. Note that we @@ -1149,7 +1174,8 @@ // exactly what block frequency models. 
if (LoopVectorizeWithBlockFrequency) { BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader()); - if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq) + if (Hints.Force != LoopVectorizeHints::FK_Enabled && + LoopEntryFreq < ColdEntryFreq) OptForSize = true; } @@ -1165,7 +1191,10 @@ // Select the optimal vectorization factor. const LoopVectorizationCostModel::VectorizationFactor VF = - CM.selectVectorizationFactor(OptForSize, Hints.Width); + CM.selectVectorizationFactor(OptForSize, Hints.Width, + Hints.Force == + LoopVectorizeHints::FK_Enabled); + // Select the unroll factor. const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width, VF.Cost); @@ -3299,15 +3328,6 @@ return false; } - // Do not loop-vectorize loops with a tiny trip count. - BasicBlock *Latch = TheLoop->getLoopLatch(); - unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); - if (TC > 0u && TC < TinyTripCountVectorThreshold) { - DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << - "This loop is not worth vectorizing.\n"); - return false; - } - // Check if we can vectorize the instructions and CFG in this loop. 
if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); @@ -5006,7 +5026,8 @@ LoopVectorizationCostModel::VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, - unsigned UserVF) { + unsigned UserVF, + bool ForceVectorization) { // Width 1 means no vectorize VectorizationFactor Factor = { 1U, 0U }; if (OptForSize && Legal->getRuntimePointerCheck()->Need) { @@ -5076,8 +5097,15 @@ } float Cost = expectedCost(1); + const float ScalarCost = Cost; unsigned Width = 1; DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n"); + + if (ForceVectorization && VF > 1) { + Width = 2; + Cost = expectedCost(Width) / (float)Width; + } + for (unsigned i=2; i <= VF; i*=2) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of @@ -5091,6 +5119,9 @@ } } + DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs() + << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"); DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n"); Factor.Width = Width; Factor.Cost = Width * Cost; Index: test/Transforms/LoopVectorize/X86/vect.omp.force.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/X86/vect.omp.force.ll @@ -0,0 +1,93 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: LV: Loop hints: force=enabled +; CHECK: LV: Loop hints: force=? 
+; No more loops in the module +; CHECK-NOT: LV: Loop hints: force= +; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of loops vectorized + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; +; The source code for the test: +; +; #include <math.h> +; void foo(float* restrict A, float * restrict B, int size) +; { +; for (int i = 0; i < size; ++i) A[i] = sinf(B[i]); +; } +; + +; +; This loop will be vectorized even though the scalar cost is lower than any of the vector costs, because vectorization is explicitly forced in metadata. +; + +define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) { +entry: + %cmp6 = icmp sgt i32 %size, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1 + %call = tail call float @llvm.sin.f32(float %0) + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1 + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +!1 = metadata !{metadata !1, metadata !2} +!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true} + +; +; This loop will not be vectorized, as the scalar cost is lower than any of the vector costs.
+; + +define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) { +entry: + %cmp6 = icmp sgt i32 %size, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3 + %call = tail call float @llvm.sin.f32(float %0) + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %size + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3 + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + +declare float @llvm.sin.f32(float) nounwind readnone + +; Dummy metadata +!3 = metadata !{metadata !3} + Index: test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -0,0 +1,73 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S -vectorizer-min-trip-count=21 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: LV: Loop hints: force=enabled +; CHECK: LV: Loop hints: force=? 
+; No more loops in the module +; CHECK-NOT: LV: Loop hints: force= +; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization +; CHECK: 1 loop-vectorize - Number of loops vectorized + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; +; The source code for the test: +; +; void foo(float* restrict A, float* restrict B) +; { +; for (int i = 0; i < 20; ++i) A[i] += B[i]; +; } +; + +; +; This loop will be vectorized even though the trip count is below the threshold, because vectorization is explicitly forced in metadata. +; +define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1 + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret void +} + +!1 = metadata !{metadata !1, metadata !2} +!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true} + +; +; This loop will not be vectorized as the trip count is below the threshold.
+; +define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3 + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv + %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %add = fadd fast float %0, %1 + store float %add, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 20 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3 + +for.end: + ret void +} + +!3 = metadata !{metadata !3} +