Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3542,10 +3542,7 @@
 /// \brief Estimate the overhead of scalarizing a value based on its type.
 /// Insert and Extract are set if the result needs to be inserted and/or
 /// extracted from vectors.
-/// If the instruction is also to be predicated, add the cost of a PHI
-/// node to the insertion cost.
 static unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract,
-                                         bool Predicated,
                                          const TargetTransformInfo &TTI) {
   if (Ty->isVoidTy())
     return 0;
@@ -3556,33 +3553,23 @@
   for (unsigned I = 0, E = Ty->getVectorNumElements(); I < E; ++I) {
     if (Extract)
       Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, Ty, I);
-    if (Insert) {
+    if (Insert)
       Cost += TTI.getVectorInstrCost(Instruction::InsertElement, Ty, I);
-      if (Predicated)
-        Cost += TTI.getCFInstrCost(Instruction::PHI);
-    }
   }
 
-  // If we have a predicated instruction, it may not be executed for each
-  // vector lane. Scale the cost by the probability of executing the
-  // predicated block.
-  if (Predicated)
-    Cost /= getReciprocalPredBlockProb();
-
   return Cost;
 }
 
 /// \brief Estimate the overhead of scalarizing an Instruction based on the
 /// types of its operands and return value.
 static unsigned getScalarizationOverhead(SmallVectorImpl<Type *> &OpTys,
-                                         Type *RetTy, bool Predicated,
+                                         Type *RetTy,
                                          const TargetTransformInfo &TTI) {
   unsigned ScalarizationCost =
-      getScalarizationOverhead(RetTy, true, false, Predicated, TTI);
+      getScalarizationOverhead(RetTy, true, false, TTI);
 
   for (Type *Ty : OpTys)
-    ScalarizationCost +=
-        getScalarizationOverhead(Ty, false, true, Predicated, TTI);
+    ScalarizationCost += getScalarizationOverhead(Ty, false, true, TTI);
 
   return ScalarizationCost;
 }
@@ -3590,7 +3577,6 @@
 /// \brief Estimate the overhead of scalarizing an instruction. This is a
 /// convenience wrapper for the type-based getScalarizationOverhead API.
 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
-                                         bool Predicated,
                                          const TargetTransformInfo &TTI) {
   if (VF == 1)
     return 0;
@@ -3602,7 +3588,7 @@
   for (unsigned OpInd = 0; OpInd < OperandsNum; ++OpInd)
     OpTys.push_back(ToVectorTy(I->getOperand(OpInd)->getType(), VF));
 
-  return getScalarizationOverhead(OpTys, RetTy, Predicated, TTI);
+  return getScalarizationOverhead(OpTys, RetTy, TTI);
 }
 
 // Estimate cost of a call instruction CI if it were vectorized with factor VF.
@@ -3635,7 +3621,7 @@
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, false, TTI);
+  unsigned ScalarizationCost = getScalarizationOverhead(Tys, RetTy, TTI);
 
   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
 
@@ -6536,10 +6522,27 @@
     // vector lane. Get the scalarization cost and scale this amount by the
     // probability of executing the predicated block. If the instruction is not
     // predicated, we fall through to the next case.
-    if (VF > 1 && Legal->isScalarWithPredication(I))
-      return VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy) /
-                 getReciprocalPredBlockProb() +
-             getScalarizationOverhead(I, VF, true, TTI);
+    if (VF > 1 && Legal->isScalarWithPredication(I)) {
+      unsigned Cost = 0;
+
+      // These instructions have a non-void type, so account for the phi nodes
+      // that we will create. This cost is likely to be zero. The phi node
+      // cost, if any, should be scaled by the block probability because it
+      // models a copy at the end of each predicated block.
+      Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
+
+      // The cost of the non-predicated instruction.
+      Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+
+      // The cost of insertelement and extractelement instructions needed for
+      // scalarization.
+      Cost += getScalarizationOverhead(I, VF, TTI);
+
+      // Scale the cost by the probability of executing the predicated blocks.
+      // This assumes the predicated block for each vector lane is equally
+      // likely.
+      return Cost / getReciprocalPredBlockProb();
+    }
   case Instruction::Add:
   case Instruction::FAdd:
   case Instruction::Sub:
@@ -6695,7 +6698,7 @@
 
       // Get the overhead of the extractelement and insertelement instructions
       // we might create due to scalarization.
-      Cost += getScalarizationOverhead(I, VF, false, TTI);
+      Cost += getScalarizationOverhead(I, VF, TTI);
 
       return Cost;
     }
@@ -6782,7 +6785,7 @@
     // The cost of executing VF copies of the scalar instruction. This opcode
     // is unknown. Assume that it is the same as 'mul'.
     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
-           getScalarizationOverhead(I, VF, false, TTI);
+           getScalarizationOverhead(I, VF, TTI);
   } // end of switch.
 }
 
Index: llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/predication_costs.ll
@@ -0,0 +1,53 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; Check predication-related cost calculations, including scalarization overhead
+; and block probability scaling. Note that the functionality being tested is
+; not specific to AArch64. We specify a target to get actual values for the
+; instruction costs.
+
+; CHECK-LABEL: predicated_udiv
+;
+; This test checks that we correctly compute the cost of the predicated udiv
+; instruction. If we assume the block probability is 50%, we compute the cost
+; as:
+;
+; Cost for vector lane zero:
+;   (udiv(1) + 2 * extractelement(0) + insertelement(0)) / 2 = 0
+; Cost for vector lane one:
+;   (udiv(1) + 2 * extractelement(3) + insertelement(3)) / 2 = 5
+;
+; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3
+; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3
+;
+define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  br i1 %c, label %if.then, label %for.inc
+
+if.then:
+  %tmp4 = udiv i32 %tmp2, %tmp3
+  br label %for.inc
+
+for.inc:
+  %tmp5 = phi i32 [ %tmp3, %for.body ], [ %tmp4, %if.then]
+  %tmp6 = add i32 %r, %tmp5
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
+  ret i32 %tmp7
+}