Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,6 +55,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -98,6 +99,9 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
                     cl::desc("Sets the SIMD width. Zero is autoselect."));
@@ -812,9 +816,11 @@
   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to VF. If UserVF is not ZERO
   /// then this vectorization factor will be selected if vectorization is
-  /// possible.
+  /// possible. If Force is true, then vectorization will be performed even if
+  /// the scalar cost is lower.
   VectorizationFactor selectVectorizationFactor(bool OptForSize,
-                                                unsigned UserVF);
+                                                unsigned UserVF,
+                                                bool Force);
 
   /// \return The size (in bits) of the widest type in the code that
   /// needs to be vectorized. We ignore values that remain scalar such as
@@ -1080,6 +1086,8 @@
     for (Loop *L : *LI)
       addInnerLoop(*L, Worklist);
 
+    LoopsAnalyzed += Worklist.size();
+
     // Now walk the identified inner loops.
     bool Changed = false;
     while (!Worklist.empty())
@@ -1132,13 +1140,13 @@
     // Check the function attributes to find out if this function should be
     // optimized for size.
-    Function *F = L->getHeader()->getParent();
+    const Function *F = L->getHeader()->getParent();
     bool OptForSize = Hints.Force != 1 &&
                       F->hasFnAttribute(Attribute::OptimizeForSize);
 
     // Compute the weighted frequency of this loop being executed and see if it
     // is less than 20% of the function entry baseline frequency. Note that we
-    // always have a canonical loop here because we think we *can* vectoriez.
+    // always have a canonical loop here because we think we *can* vectorize.
     // FIXME: This is hidden behind a flag due to pervasive problems with
     // exactly what block frequency models.
     if (LoopVectorizeWithBlockFrequency) {
@@ -1147,7 +1155,7 @@
         OptForSize = true;
     }
 
-    // Check the function attributes to see if implicit floats are allowed.a
+    // Check the function attributes to see if implicit floats are allowed.
     // FIXME: This check doesn't seem possibly correct -- what if the loop is
     // an integer loop and the vector instructions selected are purely integer
     // vector instructions?
@@ -1158,8 +1166,9 @@
     }
 
     // Select the optimal vectorization factor.
-    const LoopVectorizationCostModel::VectorizationFactor VF =
-        CM.selectVectorizationFactor(OptForSize, Hints.Width);
+    LoopVectorizationCostModel::VectorizationFactor VF =
+        CM.selectVectorizationFactor(OptForSize, Hints.Width, Hints.Force == 1);
+
     // Select the unroll factor.
     const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                               VF.Cost);
@@ -1182,6 +1191,7 @@
       // If we decided that it is *legal* to vectorize the loop then do it.
       InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
       LB.vectorize(&LVL);
+      ++LoopsVectorized;
     }
 
     // Mark the loop as already vectorized to avoid vectorizing again.
@@ -4999,7 +5009,8 @@
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
-                                                      unsigned UserVF) {
+                                                      unsigned UserVF,
+                                                      bool Force) {
   // Width 1 means no vectorize
   VectorizationFactor Factor = { 1U, 0U };
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
@@ -5069,13 +5080,20 @@
   }
 
   float Cost = expectedCost(1);
+  const float ScalarCost = Cost;
   unsigned Width = 1;
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
+
+  if (VF > 1 && Force) {
+    Width = 2;
+    Cost = expectedCost(Width) / (float)Width;
+  }
+
   for (unsigned i=2; i <= VF; i*=2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
-    float VectorCost = expectedCost(i) / (float)i;
+    const float VectorCost = expectedCost(i) / (float)i;
     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
           (int)VectorCost << ".\n");
     if (VectorCost < Cost) {
@@ -5084,7 +5102,10 @@
     }
   }
 
-  DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
+  DEBUG(if (Force && Cost >= ScalarCost) dbgs()
+            << "LV: Vectorization seems to be not beneficial, "
+            << "but was forced by a user.\n");
+  DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
   Factor.Width = Width;
   Factor.Cost = Width * Cost;
   return Factor;
Index: test/Transforms/LoopVectorize/vect.omp.force.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/vect.omp.force.ll
@@ -0,0 +1,108 @@
+; RUN: opt < %s -O2 -force-vector-unroll=2 -force-vector-width=4 -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; Loop from "vectorized"
+; CHECK: LV: Loop hints: force=enabled
+; Loop from "not_vectorized"
+; CHECK: LV: Loop hints: force=?
+; No more loops in the module
+; CHECK-NOT: LV: Loop hints: force=
+; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
+; CHECK: 1 loop-vectorize - Number of loops vectorized
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; See http://reviews.llvm.org/D3348 for details.
+
+;
+; Test #1
+;
+; Ensure that the loop is vectorized only if "#pragma omp simd" is provided.
+;
+; The source C code is:
+; void rotated(float *a, int size)
+; {
+;   int t = 0;
+; #pragma omp simd
+;   for (int i = 0; i < size; ++i) {
+;     a[i] = a[i-5] * a[i+2];
+;     ++t;
+;   }
+;}
+
+define void @vectorized(float* nocapture %a, i64 %size) {
+entry:
+  %cmp1 = icmp sgt i64 %size, 0
+  br i1 %cmp1, label %for.header, label %for.end
+
+for.header:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %cmp2 = icmp sgt i64 %indvars.iv, %size
+  br i1 %cmp2, label %for.end, label %for.body
+
+for.body:
+
+  %0 = add nsw i64 %indvars.iv, -5
+  %arrayidx = getelementptr inbounds float* %a, i64 %0
+  %1 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
+  %2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds float* %a, i64 %2
+  %3 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
+  %mul = fmul float %1, %3
+  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !1
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.header, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+!1 = metadata !{metadata !1, metadata !2}
+!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+
+;
+; Test #2
+;
+; Ensure that the loop is NOT vectorized if "#pragma omp simd" is NOT provided explicitly.
+;
+; The source C code is:
+; void rotated(float *a, int size)
+; {
+;   int t = 0;
+;   for (int i = 0; i < size; ++i) {
+;     a[i] = a[i-5] * a[i+2];
+;     ++t;
+;   }
+;}
+
+define void @not_vectorized(float* nocapture %a, i64 %size) {
+entry:
+  %cmp1 = icmp sgt i64 %size, 0
+  br i1 %cmp1, label %for.header, label %for.end
+
+for.header:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %cmp2 = icmp sgt i64 %indvars.iv, %size
+  br i1 %cmp2, label %for.end, label %for.body
+
+for.body:
+
+  %0 = add nsw i64 %indvars.iv, -5
+  %arrayidx = getelementptr inbounds float* %a, i64 %0
+  %1 = load float* %arrayidx, align 4
+  %2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds float* %a, i64 %2
+  %3 = load float* %arrayidx2, align 4
+  %mul = fmul float %1, %3
+  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx4, align 4
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.header
+
+for.end:
+  ret void
+}
\ No newline at end of file
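
For reference, the decision the new Force flag changes can be illustrated outside of LLVM with a minimal standalone sketch. This is illustrative only: selectWidth(), expectedCost(), MaxVF, and the cost numbers below are invented for the example and are not LLVM's cost model or API; the sketch only mirrors the shape of selectVectorizationFactor() above, where forcing starts the search at width 2 so a vector width is chosen even when every per-element vector cost is greater than or equal to the scalar cost.

// Standalone sketch of the forced-VF decision (illustrative only: the cost
// table and expectedCost() below are invented for this example and are not
// LLVM's cost model).
#include <cstdio>

// Hypothetical per-iteration cost of the loop at a given vector width.
static float expectedCost(unsigned Width) {
  // Vector code is slightly more expensive per element than scalar code,
  // so an unforced cost model would keep Width = 1.
  static const float Costs[] = {0.0f, 10.0f, 22.0f, 0.0f, 46.0f};
  return Costs[Width];
}

// Mirrors the shape of selectVectorizationFactor() after this patch: with
// Force set, the search starts at Width = 2, so a vector width is selected
// even when every per-element vector cost is >= the scalar cost.
static unsigned selectWidth(unsigned MaxVF, bool Force) {
  float Cost = expectedCost(1);
  const float ScalarCost = Cost;
  unsigned Width = 1;

  if (MaxVF > 1 && Force) {
    Width = 2;
    Cost = expectedCost(Width) / (float)Width;
  }

  for (unsigned i = 2; i <= MaxVF; i *= 2) {
    const float VectorCost = expectedCost(i) / (float)i;
    if (VectorCost < Cost) {
      Cost = VectorCost;
      Width = i;
    }
  }

  if (Force && Cost >= ScalarCost)
    std::printf("vectorization not beneficial, but forced by the user\n");
  return Width;
}

int main() {
  std::printf("unforced width: %u\n", selectWidth(4, false)); // prints 1
  std::printf("forced width:   %u\n", selectWidth(4, true));  // prints 2
  return 0;
}

With these invented numbers the unforced query keeps the scalar loop, while the forced query still returns a vector width and emits the same kind of "not beneficial, but was forced" diagnostic that the patch adds to the real cost model.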