Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,6 +55,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -98,6 +99,9 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
@@ -812,9 +816,11 @@
   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to VF. If UserVF is not ZERO
   /// then this vectorization factor will be selected if vectorization is
-  /// possible.
+  /// possible. If Force is true, then vectorization will be performed even if
+  /// the scalar cost is lower.
   VectorizationFactor selectVectorizationFactor(bool OptForSize,
-                                                unsigned UserVF);
+                                                unsigned UserVF,
+                                                bool Force);
 
   /// \return The size (in bits) of the widest type in the code that
   /// needs to be vectorized. We ignore values that remain scalar such as
@@ -1080,6 +1086,8 @@
     for (Loop *L : *LI)
       addInnerLoop(*L, Worklist);
 
+    LoopsAnalyzed += Worklist.size();
+
     // Now walk the identified inner loops.
     bool Changed = false;
     while (!Worklist.empty())
@@ -1132,13 +1140,13 @@
     // Check the function attributes to find out if this function should be
     // optimized for size.
-    Function *F = L->getHeader()->getParent();
+    const Function *F = L->getHeader()->getParent();
     bool OptForSize = Hints.Force != 1 &&
                       F->hasFnAttribute(Attribute::OptimizeForSize);
 
     // Compute the weighted frequency of this loop being executed and see if it
     // is less than 20% of the function entry baseline frequency. Note that we
-    // always have a canonical loop here because we think we *can* vectoriez.
+    // always have a canonical loop here because we think we *can* vectorize.
     // FIXME: This is hidden behind a flag due to pervasive problems with
     // exactly what block frequency models.
     if (LoopVectorizeWithBlockFrequency) {
@@ -1147,7 +1155,7 @@
         OptForSize = true;
     }
 
-    // Check the function attributes to see if implicit floats are allowed.a
+    // Check the function attributes to see if implicit floats are allowed.
     // FIXME: This check doesn't seem possibly correct -- what if the loop is
     // an integer loop and the vector instructions selected are purely integer
     // vector instructions?
@@ -1158,8 +1166,9 @@
     }
 
     // Select the optimal vectorization factor.
-    const LoopVectorizationCostModel::VectorizationFactor VF =
-        CM.selectVectorizationFactor(OptForSize, Hints.Width);
+    LoopVectorizationCostModel::VectorizationFactor VF =
+        CM.selectVectorizationFactor(OptForSize, Hints.Width, Hints.Force == 1);
+
     // Select the unroll factor.
     const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                               VF.Cost);
@@ -1182,6 +1191,7 @@
       // If we decided that it is *legal* to vectorize the loop then do it.
       InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
       LB.vectorize(&LVL);
+      ++LoopsVectorized;
     }
 
     // Mark the loop as already vectorized to avoid vectorizing again.
@@ -4999,7 +5009,8 @@
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
-                                                      unsigned UserVF) {
+                                                      unsigned UserVF,
+                                                      bool Force) {
   // Width 1 means no vectorize
   VectorizationFactor Factor = { 1U, 0U };
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
@@ -5069,13 +5080,20 @@
   }
 
   float Cost = expectedCost(1);
+  const float ScalarCost = Cost;
   unsigned Width = 1;
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
+
+  if (VF > 1 && Force) {
+    Width = 2;
+    Cost = expectedCost(Width) / (float)Width;
+  }
+
   for (unsigned i=2; i <= VF; i*=2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
-    float VectorCost = expectedCost(i) / (float)i;
+    const float VectorCost = expectedCost(i) / (float)i;
     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
           (int)VectorCost << ".\n");
     if (VectorCost < Cost) {
@@ -5084,7 +5102,10 @@
     }
   }
 
-  DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
+  DEBUG(if (Force && Cost >= ScalarCost) dbgs()
+            << "LV: Vectorization seems to be not beneficial, "
+            << "but was forced by a user.\n");
+  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
   Factor.Width = Width;
   Factor.Cost = Width * Cost;
   return Factor;
Index: test/Transforms/LoopVectorize/vect.omp.force.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/vect.omp.force.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -O2 -force-vector-unroll=2 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Loop from "vectorized"
+; CHECK: LV: Loop hints: force=enabled
+; Loop from "not_vectorized"
+; CHECK: LV: Loop hints: force=?
+; No more loops in the module
+; CHECK-NOT: LV: Loop hints: force=
+; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
+; CHECK: 1 loop-vectorize - Number of loops vectorized
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; See http://reviews.llvm.org/D3348 for details.
+
+;
+; Test #1
+;
+; Ensure that the loop is vectorized only if "#pragma omp simd" is provided.
+;
+; The source C code is:
+; void rotated(float *a, int size)
+; {
+;   int t = 0;
+; #pragma omp simd
+;   for (int i = 0; i < size; ++i) {
+;     a[i] = a[i-5] * a[i+2];
+;     ++t;
+;   }
+;}
+
+define void @vectorized(float* nocapture %a, i64 %size) {
+entry:
+  %cmp1 = icmp sgt i64 %size, 0
+  br i1 %cmp1, label %for.header, label %for.end
+
+for.header:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %cmp2 = icmp sgt i64 %indvars.iv, %size
+  br i1 %cmp2, label %for.end, label %for.body
+
+for.body:
+
+  %0 = add nsw i64 %indvars.iv, -5
+  %arrayidx = getelementptr inbounds float* %a, i64 %0
+  %1 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
+  %2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds float* %a, i64 %2
+  %3 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
+  %mul = fmul float %1, %3
+  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !1
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.header, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+!1 = metadata !{metadata !1, metadata !2}
+!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+
+;
+; Test #2
+;
+; Ensure that the loop is NOT vectorized if "#pragma omp simd" is NOT provided explicitly.
+;
+; The source C code is:
+; void rotated(float *a, int size)
+; {
+;   int t = 0;
+;   for (int i = 0; i < size; ++i) {
+;     a[i] = a[i-5] * a[i+2];
+;     ++t;
+;   }
+;}
+
+define void @not_vectorized(float* nocapture %a, i64 %size) {
+entry:
+  %cmp1 = icmp sgt i64 %size, 0
+  br i1 %cmp1, label %for.header, label %for.end
+
+for.header:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %cmp2 = icmp sgt i64 %indvars.iv, %size
+  br i1 %cmp2, label %for.end, label %for.body
+
+for.body:
+
+  %0 = add nsw i64 %indvars.iv, -5
+  %arrayidx = getelementptr inbounds float* %a, i64 %0
+  %1 = load float* %arrayidx, align 4
+  %2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds float* %a, i64 %2
+  %3 = load float* %arrayidx2, align 4
+  %mul = fmul float %1, %3
+  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx4, align 4
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.header
+
+for.end:
+  ret void
+}
\ No newline at end of file
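
For reference, the decision the new Force flag changes can be illustrated outside of LLVM with a minimal standalone sketch. This is illustrative only: selectWidth(), expectedCost(), MaxVF, and the cost numbers below are invented for the example and are not LLVM's cost model or API; the sketch only mirrors the shape of selectVectorizationFactor() above, where forcing starts the search at width 2 so a vector width is chosen even when every per-element vector cost is greater than or equal to the scalar cost.

// Standalone sketch of the forced-VF decision (illustrative only: the cost
// table and expectedCost() below are invented for this example and are not
// LLVM's cost model).
#include <cstdio>

// Hypothetical per-iteration cost of the loop at a given vector width.
static float expectedCost(unsigned Width) {
  // Vector code is slightly more expensive per element than scalar code,
  // so an unforced cost model would keep Width = 1.
  static const float Costs[] = {0.0f, 10.0f, 22.0f, 0.0f, 46.0f};
  return Costs[Width];
}

// Mirrors the shape of selectVectorizationFactor() after this patch: with
// Force set, the search starts at Width = 2, so a vector width is selected
// even when every per-element vector cost is >= the scalar cost.
static unsigned selectWidth(unsigned MaxVF, bool Force) {
  float Cost = expectedCost(1);
  const float ScalarCost = Cost;
  unsigned Width = 1;

  if (MaxVF > 1 && Force) {
    Width = 2;
    Cost = expectedCost(Width) / (float)Width;
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    const float VectorCost = expectedCost(i) / (float)i;
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (Force && Cost >= ScalarCost)
    std::printf("vectorization not beneficial, but forced by the user\n");
  return Width;
}

int main() {
  std::printf("unforced width: %u\n", selectWidth(4, false)); // prints 1
  std::printf("forced width:   %u\n", selectWidth(4, true));  // prints 2
  return 0;
}

With these invented numbers the unforced query keeps the scalar loop, while the forced query still returns a vector width and emits the same kind of "not beneficial, but was forced" diagnostic that the patch adds to the real cost model.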