Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1536,11 +1536,18 @@
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
-  unsigned expectedCost(unsigned VF);
+  unsigned expectedCost(unsigned VF, bool &ActuallyVectorized);
+
+  /// Call expectedCost, but ignore the ActuallyVectorized output parameter.
+  unsigned expectedCost(unsigned VF) {
+    bool ActuallyVectorized = false;
+    return expectedCost(VF, ActuallyVectorized);
+  }
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  unsigned getInstructionCost(Instruction *I, unsigned VF);
+  unsigned getInstructionCost(Instruction *I, unsigned VF,
+                              bool &ActuallyVectorized);
 
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
@@ -5164,9 +5171,15 @@
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
-    float VectorCost = expectedCost(i) / (float)i;
+    bool ActuallyVectorized = false;
+    float VectorCost = expectedCost(i, ActuallyVectorized) / (float)i;
     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
           (int)VectorCost << ".\n");
+    if (!ActuallyVectorized && !ForceVectorization) {
+      DEBUG(dbgs() << "LV: Not considering vector loop of width " << i <<
+            " because it will not generate any vector instructions.\n");
+      continue;
+    }
     if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
@@ -5541,7 +5554,8 @@
   return RUs;
 }
 
-unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF,
+                                                  bool &ActuallyVectorized) {
   unsigned Cost = 0;
 
   // For each block.
@@ -5560,7 +5574,7 @@
       if (ValuesToIgnore.count(&*it))
         continue;
 
-      unsigned C = getInstructionCost(&*it, VF);
+      unsigned C = getInstructionCost(&*it, VF, ActuallyVectorized);
 
       // Check if we should override the cost.
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
@@ -5655,7 +5669,8 @@
 }
 
 unsigned
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF,
+                                               bool &ActuallyVectorized) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
   if (Legal->isUniformAfterVectorization(I))
@@ -5666,6 +5681,10 @@
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   Type *VectorTy = ToVectorTy(RetTy, VF);
 
+  if (VF > 1 && !ActuallyVectorized && !VectorTy->isVoidTy() &&
+      TTI.getNumberOfParts(VectorTy) < VF)
+    ActuallyVectorized = true;
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
Index: test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/PowerPC/vectorize-only-for-real.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+; Function Attrs: nounwind
+define zeroext i32 @test() #0 {
+; CHECK-LABEL: @test
+; CHECK-NOT: x i32>
+
+entry:
+  %a = alloca [1600 x i32], align 4
+  %c = alloca [1600 x i32], align 4
+  %0 = bitcast [1600 x i32]* %a to i8*
+  call void @llvm.lifetime.start(i64 6400, i8* %0) #3
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %1 = bitcast [1600 x i32]* %c to i8*
+  call void @llvm.lifetime.start(i64 6400, i8* %1) #3
+  %arraydecay = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 0
+  %arraydecay1 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 0
+  %call = call signext i32 @bar(i32* %arraydecay, i32* %arraydecay1) #3
+  br label %for.body6
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv25 = phi i64 [ 0, %entry ], [ %indvars.iv.next26, %for.body ]
+  %arrayidx = getelementptr inbounds [1600 x i32], [1600 x i32]* %a, i64 0, i64 %indvars.iv25
+  %2 = trunc i64 %indvars.iv25 to i32
+  store i32 %2, i32* %arrayidx, align 4
+  %indvars.iv.next26 = add nuw nsw i64 %indvars.iv25, 1
+  %exitcond27 = icmp eq i64 %indvars.iv.next26, 1600
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup5:                                ; preds = %for.body6
+  call void @llvm.lifetime.end(i64 6400, i8* nonnull %1) #3
+  call void @llvm.lifetime.end(i64 6400, i8* %0) #3
+  ret i32 %add
+
+for.body6:                                        ; preds = %for.body6, %for.cond.cleanup
+  %indvars.iv = phi i64 [ 0, %for.cond.cleanup ], [ %indvars.iv.next, %for.body6 ]
+  %s.022 = phi i32 [ 0, %for.cond.cleanup ], [ %add, %for.body6 ]
+  %arrayidx8 = getelementptr inbounds [1600 x i32], [1600 x i32]* %c, i64 0, i64 %indvars.iv
+  %3 = load i32, i32* %arrayidx8, align 4
+  %add = add i32 %3, %s.022
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+declare signext i32 @bar(i32*, i32*) #2
+
+attributes #0 = { nounwind "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "target-cpu"="a2q" "target-features"="+qpx,-altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-vsx" }
+attributes #3 = { nounwind }
+