Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -342,7 +342,13 @@
 ///
 /// TODO: We should use actual block probability here, if available. Currently,
 /// we always assume predicated blocks have a 50% chance of executing.
-static unsigned getReciprocalPredBlockProb() { return 2; }
+
+static unsigned getReciprocalPredBlockProb1(BlockFrequencyInfo *BFI,
+                                            BasicBlock *BB,
+                                            BasicBlock *HeaderBB) {
+  return ((BFI->getBlockFreq(HeaderBB)).getFrequency()) /
+         ((BFI->getBlockFreq(BB)).getFrequency());
+}
 
 /// A helper function that adds a 'fast' flag to floating-point operations.
 static Value *addFastMathFlag(Value *V) {
@@ -1815,9 +1821,10 @@
                              const TargetLibraryInfo *TLI, DemandedBits *DB,
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
-                             const LoopVectorizeHints *Hints)
+                             const LoopVectorizeHints *Hints,
+                             BlockFrequencyInfo *BFI)
       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
-        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
+        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), BFI(BFI) {}
 
   /// \return An upper bound for the vectorization factor, or None if
   /// vectorization should be avoided up front.
@@ -2168,6 +2175,8 @@
   const Function *TheFunction;
   /// Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
+  /// Block Frequency Info.
+  BlockFrequencyInfo *BFI;
   /// Values to ignore in the cost model.
   SmallPtrSet<const Value *, 16> ValuesToIgnore;
   /// Values to ignore in the cost model when VF > 1.
@@ -6928,8 +6937,10 @@
                                 ToVectorTy(J->getType(), VF), false, true);
     }
 
+    // Scale the total scalar cost by block probability.
-    ScalarCost /= getReciprocalPredBlockProb();
+    ScalarCost /=
+        getReciprocalPredBlockProb1(BFI, I->getParent(), TheLoop->getHeader());
 
     // Compute the discount. A non-negative discount means the vector version
     // of the instruction costs more, and scalarizing would be beneficial.
@@ -6984,7 +6995,8 @@
     // the predicated block. Thus, scale the block's cost by the probability of
     // executing it.
     if (VF == 1 && Legal->blockNeedsPredication(BB))
-      BlockCost.first /= getReciprocalPredBlockProb();
+      BlockCost.first /=
+          getReciprocalPredBlockProb1(BFI, BB, TheLoop->getHeader());
 
     Cost.first += BlockCost.first;
     Cost.second |= BlockCost.second;
@@ -7055,7 +7067,8 @@
     // lane. Scale the cost by the probability of executing the predicated
     // block.
     if (Legal->isScalarWithPredication(I))
-      Cost /= getReciprocalPredBlockProb();
+      Cost /=
+          getReciprocalPredBlockProb1(BFI, I->getParent(), TheLoop->getHeader());
 
     return Cost;
   }
@@ -7398,7 +7411,8 @@
       // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
-      return Cost / getReciprocalPredBlockProb();
+      return Cost / getReciprocalPredBlockProb1(BFI, I->getParent(),
+                                                TheLoop->getHeader());
     }
     LLVM_FALLTHROUGH;
   case Instruction::Add:
@@ -7862,7 +7876,7 @@
 
   // Use the cost model.
   LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints);
+                                &Hints, BFI);
   CM.collectValuesToIgnore();
 
   // Use the planner for vectorization.
Index: test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -12,7 +12,7 @@
 ; %tmp4 a lower scalarization overhead.
 ;
 ; COST-LABEL: predicated_udiv_scalarized_operand
-; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+; COST: LV: Found an estimated cost of 11 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
 ;
 ; CHECK-LABEL: @predicated_udiv_scalarized_operand(
 ; CHECK: vector.body:
@@ -22,13 +22,13 @@
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], %broadcast.splat2
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
 ; CHECK: [[PRED_UDIV_IF]]:
 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x
-; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE]]
 ; CHECK: [[PRED_UDIV_CONTINUE]]:
@@ -37,9 +37,8 @@
 ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
 ; CHECK: [[PRED_UDIV_IF1]]:
 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x
-; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP12]]
 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
 ; CHECK-NEXT: br label %[[PRED_UDIV_CONTINUE2]]
 ; CHECK: [[PRED_UDIV_CONTINUE2]]:
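Illustration (not part of the patch): the new helper computes the reciprocal of a predicated block's execution probability as the loop header's block frequency divided by the predicated block's frequency, replacing the previous hard-coded divisor of 2 (the old 50% assumption). The standalone C++ sketch below shows that arithmetic with made-up frequency values; the function name, the numbers, and the zero-frequency guard are inventions for this example and do not appear in the patch, where the frequencies come from BlockFrequencyInfo::getBlockFreq.

// Standalone sketch of the cost-scaling arithmetic (no LLVM dependencies).
#include <cassert>
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the patch's getReciprocalPredBlockProb1: the
// reciprocal of the predicated block's execution probability is
// header frequency / predicated-block frequency.
static unsigned reciprocalPredBlockProb(uint64_t HeaderFreq, uint64_t BlockFreq) {
  assert(BlockFreq != 0 && "zero guard added for this example only");
  return static_cast<unsigned>(HeaderFreq / BlockFreq);
}

int main() {
  // Example: the header is 4x hotter than the predicated block, so the block
  // executes with probability 1/4 and the scalar cost is divided by 4 instead
  // of the previous fixed divisor of 2.
  uint64_t HeaderFreq = 1000, PredBlockFreq = 250;
  unsigned Recip = reciprocalPredBlockProb(HeaderFreq, PredBlockFreq);
  unsigned ScalarCost = 8; // arbitrary per-lane cost accumulated by the model
  std::cout << "reciprocal probability: " << Recip << "\n"                // 4
            << "scaled scalar cost:     " << ScalarCost / Recip << "\n";  // 2
  return 0;
}

Run as-is, this prints a reciprocal probability of 4, so a rarely executed predicated block is discounted more heavily than under the old fixed 50% model, which is the behavioral change the updated COST line in the test reflects.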