Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -818,7 +818,8 @@
   /// then this vectorization factor will be selected if vectorization is
   /// possible.
   VectorizationFactor selectVectorizationFactor(bool OptForSize,
-                                                unsigned UserVF);
+                                                unsigned UserVF,
+                                                bool ForceVectorization);
 
   /// \return The size (in bits) of the widest type in the code that
   /// needs to be vectorized. We ignore values that remain scalar such as
@@ -890,13 +891,17 @@
   unsigned Width;
   /// Vectorization unroll factor.
   unsigned Unroll;
-  /// Vectorization forced (-1 not selected, 0 force disabled, 1 force enabled)
-  int Force;
+  /// Whether the user forced vectorization on or off for this loop.
+  enum ForceKind {
+    FK_Undefined = -1, ///< Initial state set by the constructor.
+    FK_Disabled = 0,   ///< Forced off; the loop is not vectorized.
+    FK_Enabled = 1,    ///< Forced on by an "enable" hint.
+  } Force;
 
   LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
   : Width(VectorizationFactor)
   , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
-  , Force(-1)
+  , Force(FK_Undefined)
   , LoopID(L->getLoopID()) {
     getHints(L);
     // The command line options override any loop metadata except for when
@@ -1009,7 +1014,8 @@
         DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
     } else if (Hint == "enable") {
       if (C->getBitWidth() == 1)
-        Force = Val;
+        Force = Val == 1 ? LoopVectorizeHints::FK_Enabled
+                         : LoopVectorizeHints::FK_Disabled;
       else
         DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
     } else {
@@ -1105,18 +1111,20 @@
     LoopVectorizeHints Hints(L, DisableUnrolling);
 
     DEBUG(dbgs() << "LV: Loop hints:"
-                 << " force=" << (Hints.Force == 0
-                                      ? "disabled"
-                                      : (Hints.Force == 1 ? "enabled" : "?"))
-                 << " width=" << Hints.Width << " unroll=" << Hints.Unroll
-                 << "\n");
-
-    if (Hints.Force == 0) {
+                 << " force="
+                 << (Hints.Force == LoopVectorizeHints::FK_Disabled
+                         ? "disabled"
+                         : (Hints.Force == LoopVectorizeHints::FK_Enabled
+                                ? "enabled"
+                                : "?")) << " width=" << Hints.Width
+                 << " unroll=" << Hints.Unroll << "\n");
+
+    if (Hints.Force == LoopVectorizeHints::FK_Disabled) {
       DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
       return false;
     }
 
-    if (!AlwaysVectorize && Hints.Force != 1) {
+    if (!AlwaysVectorize && Hints.Force != LoopVectorizeHints::FK_Enabled) {
       DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
       return false;
     }
@@ -1139,8 +1147,8 @@
     // Check the function attributes to find out if this function should be
     // optimized for size.
     Function *F = L->getHeader()->getParent();
-    bool OptForSize =
-        Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
+    bool OptForSize = Hints.Force != LoopVectorizeHints::FK_Enabled &&
+                      F->hasFnAttribute(Attribute::OptimizeForSize);
 
     // Compute the weighted frequency of this loop being executed and see if it
     // is less than 20% of the function entry baseline frequency. Note that we
@@ -1149,7 +1157,8 @@
     // exactly what block frequency models.
     if (LoopVectorizeWithBlockFrequency) {
       BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
-      if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
+      if (Hints.Force != LoopVectorizeHints::FK_Enabled &&
+          LoopEntryFreq < ColdEntryFreq)
         OptForSize = true;
     }
 
@@ -1165,7 +1174,10 @@
 
     // Select the optimal vectorization factor.
     const LoopVectorizationCostModel::VectorizationFactor VF =
-                          CM.selectVectorizationFactor(OptForSize, Hints.Width);
+        CM.selectVectorizationFactor(OptForSize, Hints.Width,
+                                     Hints.Force ==
+                                         LoopVectorizeHints::FK_Enabled);
+
     // Select the unroll factor.
     const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                         VF.Cost);
@@ -5006,7 +5018,8 @@
 
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
-                                                      unsigned UserVF) {
+                                                      unsigned UserVF,
+                                                      bool ForceVectorization) {
   // Width 1 means no vectorize
   VectorizationFactor Factor = { 1U, 0U };
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
@@ -5076,8 +5089,15 @@
   }
 
   float Cost = expectedCost(1);
+  const float ScalarCost = Cost;
   unsigned Width = 1;
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
+
+  if (ForceVectorization && VF > 1) {
+    Width = 2;
+    Cost = expectedCost(Width) / (float)Width;
+  }
+
   for (unsigned i=2; i <= VF; i*=2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
@@ -5091,6 +5111,9 @@
     }
   }
 
+  DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+        << "LV: Vectorization seems to be not beneficial, "
+        << "but was forced by a user.\n");
   DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
   Factor.Width = Width;
   Factor.Cost = Width * Cost;
Index: test/Transforms/LoopVectorize/X86/vect.omp.force.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/X86/vect.omp.force.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: LV: Loop hints: force=enabled
+; CHECK: LV: Loop hints: force=?
+; No more loops in the module
+; CHECK-NOT: LV: Loop hints: force=
+; CHECK: 2 loop-vectorize               - Number of loops analyzed for vectorization
+; CHECK: 1 loop-vectorize               - Number of loops vectorized
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;
+; The source code for the test:
+;
+; #include <math.h>
+; void foo(float* restrict A, float * restrict B, int size)
+; {
+;   for (int i = 0; i < size; ++i) A[i] = sinf(B[i]);
+; }
+;
+
+;
+; This loop will be vectorized even though the scalar cost is lower than any of the vector costs, because vectorization is explicitly forced in metadata.
+;
+
+define void @vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) { ; A[i] = sin(B[i]) for i in [0, size)
+entry:
+  %cmp6 = icmp sgt i32 %size, 0 ; run the loop only when size > 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] ; i, starting at 0
+  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv ; &B[i]
+  %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
+  %call = tail call float @llvm.sin.f32(float %0) ; sin(B[i])
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv ; &A[i]
+  store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %size ; exit once i + 1 == size
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1 ; !1 carries llvm.vectorizer.enable
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+!1 = metadata !{metadata !1, metadata !2}
+!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
+
+;
+; The loop in this function will not be vectorized, as the scalar cost is lower than any of the vector costs.
+;
+
+define void @not_vectorized(float* noalias nocapture %A, float* noalias nocapture %B, i32 %size) { ; A[i] = sin(B[i]) for i in [0, size)
+entry:
+  %cmp6 = icmp sgt i32 %size, 0 ; run the loop only when size > 0
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] ; i, starting at 0
+  %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv ; &B[i]
+  %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
+  %call = tail call float @llvm.sin.f32(float %0) ; sin(B[i])
+  %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv ; &A[i]
+  store float %call, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; i + 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %size ; exit once i + 1 == size
+  br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3 ; !3 has no vectorizer hints
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+declare float @llvm.sin.f32(float) nounwind readnone
+
+; Dummy metadata
+!3 = metadata !{metadata !3}
+