diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1189,12 +1189,12 @@
                              LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
                              const TargetLibraryInfo *TLI, DemandedBits *DB,
-                             AssumptionCache *AC,
+                             AssumptionCache *AC, BlockFrequencyInfo *BFI,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
                              InterleavedAccessInfo &IAI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
-        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+        TTI(TTI), TLI(TLI), DB(DB), AC(AC), BFI(BFI), ORE(ORE), TheFunction(F),
         Hints(Hints), InterleaveInfo(IAI) {}
 
   /// \return An upper bound for the vectorization factors (both fixed and
@@ -1851,6 +1851,11 @@
   /// \p VF is the vectorization factor chosen for the original loop.
   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
 
+  /// A helper function that scales provide instruction cost to the
+  /// probability of it's execution relative to the loop header.
+  InstructionCost getInstCostScaledByFreq(InstructionCost &Cost,
+                                          const BasicBlock *BB) const;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -1876,6 +1881,8 @@
   /// Assumption cache.
   AssumptionCache *AC;
 
+  BlockFrequencyInfo *BFI;
+
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
@@ -4526,9 +4533,8 @@
     ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
 
     // Scale the cost by the probability of executing the predicated blocks.
-    // This assumes the predicated block for each vector lane is equally
-    // likely.
-    ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
+    ScalarizationCost =
+        getInstCostScaledByFreq(ScalarizationCost, I->getParent());
   }
   InstructionCost SafeDivisorCost = 0;
 
@@ -5580,6 +5586,26 @@
   return false;
 }
 
+InstructionCost LoopVectorizationCostModel::getInstCostScaledByFreq(
+    InstructionCost &Cost, const BasicBlock *BB) const {
+
+  if (!Cost.isValid()) {
+    return Cost;
+  }
+
+  if (!BFI)
+    return Cost / getReciprocalPredBlockProb();
+
+  auto HeaderFreq = BFI->getBlockFreq(TheLoop->getHeader()).getFrequency();
+
+  if (HeaderFreq == 0)
+    return Cost / getReciprocalPredBlockProb();
+
+  // Scale the total scalar cost by relative block probability.
+  return (*Cost.getValue() *
+          ((double)BFI->getBlockFreq(BB).getFrequency() / HeaderFreq));
+}
+
 VectorizationFactor
 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
@@ -6390,7 +6416,7 @@
     // cost by the probability of executing it. blockNeedsPredication from
     // Legal is used so as to not include all blocks in tail folded loops.
     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
-      BlockCost.first /= getReciprocalPredBlockProb();
+      BlockCost.first = getInstCostScaledByFreq(BlockCost.first, BB);
 
     Cost.first += BlockCost.first;
     Cost.second |= BlockCost.second;
@@ -6475,7 +6501,7 @@
   // conditional branches, but may not be executed for each vector lane. Scale
   // the cost by the probability of executing the predicated block.
   if (isPredicatedInst(I)) {
-    Cost /= getReciprocalPredBlockProb();
+    Cost = getInstCostScaledByFreq(Cost, I->getParent());
 
     // Add the cost of an i1 extract and a branch
     auto *Vec_i1Ty =
@@ -9851,8 +9877,8 @@
   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
 
-  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints, IAI);
+  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, BFI,
+                                ORE, F, &Hints, IAI);
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
@@ -10193,8 +10219,8 @@
   }
 
   // Use the cost model.
-  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
-                                F, &Hints, IAI);
+  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, BFI,
+                                ORE, F, &Hints, IAI);
   CM.collectValuesToIgnore();
   CM.collectElementTypesForWidening();
 
@@ -10551,9 +10577,7 @@
     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
     ProfileSummaryInfo *PSI =
         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
-    BlockFrequencyInfo *BFI = nullptr;
-    if (PSI && PSI->hasProfileSummary())
-      BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
+    BlockFrequencyInfo *BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
     LoopVectorizeResult Result =
         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
     if (!Result.MadeAnyChange)
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -79,20 +79,20 @@
 ; DISABLED_MASKED_STRIDED-LABEL: 'test2'
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 8 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 17 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 21 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 35 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 8 For instruction: store i16 %2, ptr %arrayidx7, align 2
-; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 43 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 71 for VF 16 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED:  LV: Found an estimated cost of 3000000 for VF 16 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test2'
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
-; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 13 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED:  LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2
diff --git a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-scatter-i32-with-i8-index.ll
@@ -49,7 +49,7 @@
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i32 %valB, ptr %out, align 4
-; AVX512:  LV: Found an estimated cost of 10 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
+; AVX512:  LV: Found an estimated cost of 12 for VF 4 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 18 for VF 16 For instruction: store i32 %valB, ptr %out, align 4
 ; AVX512:  LV: Found an estimated cost of 36 for VF 32 For instruction: store i32 %valB, ptr %out, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll
--- a/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-scatter-i64-with-i8-index.ll
@@ -49,7 +49,7 @@
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 5 for VF 2 For instruction: store i64 %valB, ptr %out, align 8
-; AVX512:  LV: Found an estimated cost of 11 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
+; AVX512:  LV: Found an estimated cost of 12 for VF 4 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %valB, ptr %out, align 8
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: store i64 %valB, ptr %out, align 8
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -246,6 +246,8 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis
+; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -118,6 +118,8 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo
+; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis
 ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo
 ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: SROAPass on foo
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -182,6 +182,8 @@
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis
+; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -17,7 +17,7 @@
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %inc = add nuw nsw i32 %i.016, 1
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   %exitcond.not = icmp eq i32 %inc, %n
 ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction:   br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Scalar loop costs: 5.
+; CHECK: LV: Scalar loop costs: 4.
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %arrayidx = getelementptr inbounds i16, ptr %s, i32 %i.016
 ; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction:   %1 = load i16, ptr %arrayidx, align 2
diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicated_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/predicated_cost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/predicated_cost.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; Test that loop with conditional control flow is not vectorized due to not beneficial.
+; bb2 is taken with very low probability in scalar loop but unconditionally (with a mask)
+; executed when vectorized. 
+define void @test1(i64 %i4) #0 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB3:%.*]] ], [ 0, [[BB0:%.*]] ]
+; CHECK-NEXT:    [[I8:%.*]] = load i64, ptr addrspace(1) null, align 8
+; CHECK-NEXT:    br i1 undef, label [[BB3]], label [[BB2:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[I15:%.*]] = add i64 [[I8]], 12
+; CHECK-NEXT:    [[I16:%.*]] = zext i32 1 to i64
+; CHECK-NEXT:    [[I17:%.*]] = sub i64 [[I15]], [[I16]]
+; CHECK-NEXT:    [[NARROW_US1174_US_US:%.*]] = icmp ult i64 [[I17]], 0
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[I18:%.*]] = phi i1 [ [[NARROW_US1174_US_US]], [[BB2]] ], [ false, [[BB1]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[I20:%.*]] = icmp ult i64 [[INDVARS_IV]], [[I4:%.*]]
+; CHECK-NEXT:    br i1 [[I20]], label [[BB1]], label [[GUARDED_I:%.*]]
+; CHECK:       guarded.i:
+; CHECK-NEXT:    ret void
+;
+bb0:
+  br label %bb1
+
+bb1:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb3 ], [ 0, %bb0 ]
+  %i8 = load i64, ptr addrspace(1) null, align 8
+  br i1 undef, label %bb3, label %bb2, !prof !0
+
+bb2:
+  %i15 = add i64 %i8, 12
+  %i16 = zext i32 1 to i64
+  %i17 = sub i64 %i15, %i16
+  %narrow.us1174.us.us = icmp ult i64 %i17, 0
+  br label %bb3
+
+bb3:
+  %i18 = phi i1 [ %narrow.us1174.us.us, %bb2 ], [ false, %bb1 ]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %i20 = icmp ult i64 %indvars.iv, %i4
+  br i1 %i20, label %bb1, label %guarded.i
+
+guarded.i:
+  ret void
+}
+
+; Test that loop with conditional control flow is beneficial to vectorize.
+; bb2 is taken with high probability in scalar loop thus overhead from
+; masked execution doesn't outwheight vectorization benefits. 
+define void @test2(i64 %i4) #0 {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[I4:%.*]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[GUARDED_I:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB0:%.*]] ]
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB3:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[I8:%.*]] = load i64, ptr addrspace(1) null, align 8
+; CHECK-NEXT:    br i1 undef, label [[BB3]], label [[BB2:%.*]], !prof [[PROF4:![0-9]+]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[I15:%.*]] = add i64 [[I8]], 12
+; CHECK-NEXT:    [[I16:%.*]] = zext i32 1 to i64
+; CHECK-NEXT:    [[I17:%.*]] = sub i64 [[I15]], [[I16]]
+; CHECK-NEXT:    [[NARROW_US1174_US_US:%.*]] = icmp ult i64 [[I17]], 0
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[I18:%.*]] = phi i1 [ [[NARROW_US1174_US_US]], [[BB2]] ], [ false, [[BB1]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[I20:%.*]] = icmp ult i64 [[INDVARS_IV]], [[I4]]
+; CHECK-NEXT:    br i1 [[I20]], label [[BB1]], label [[GUARDED_I]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       guarded.i:
+; CHECK-NEXT:    ret void
+;
+bb0:
+  br label %bb1
+
+bb1:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %bb3 ], [ 0, %bb0 ]
+  %i8 = load i64, ptr addrspace(1) null, align 8
+  br i1 undef, label %bb3, label %bb2, !prof !1
+
+bb2:
+  %i15 = add i64 %i8, 12
+  %i16 = zext i32 1 to i64
+  %i17 = sub i64 %i15, %i16
+  %narrow.us1174.us.us = icmp ult i64 %i17, 0
+  br label %bb3
+
+bb3:
+  %i18 = phi i1 [ %narrow.us1174.us.us, %bb2 ], [ false, %bb1 ]
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %i20 = icmp ult i64 %indvars.iv, %i4
+  br i1 %i20, label %bb1, label %guarded.i
+
+guarded.i:
+  ret void
+
+}
+
+
+attributes #0 = { "target-cpu"="skylake" }
+!0 = !{!"branch_weights", i32 160, i32 1}
+!1 = !{!"branch_weights", i32 1, i32 160}
diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
--- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll
@@ -48,18 +48,18 @@
 ; CHECK-NEXT:   EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; CHECK-NEXT:   WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1>
 ; CHECK-NEXT:   EMIT vp<[[COND:%.+]]> = icmp ule ir<%iv> vp<[[BTC]]>
-; CHECK-NEXT:   WIDEN ir<%cond0> = icmp ult ir<%iv>, ir<13>
-; CHECK-NEXT:   WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20>
-; CHECK-NEXT: Successor(s): pred.store
+; CHECK-NEXT:  Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  <xVFxUF> pred.store: {
 ; CHECK-NEXT:    pred.store.entry:
-; CHECK-NEXT:      BRANCH-ON-MASK vp<[[COND]]>
+; CHECK-NEXT:      BRANCH-ON-MASK vp<%5>
 ; CHECK-NEXT:    Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT:    pred.store.if:
-; CHECK-NEXT:      vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
-; CHECK-NEXT:      REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]>
+; CHECK-NEXT:      vp<%6>      = SCALAR-STEPS vp<%3>, ir<1>
+; CHECK-NEXT:      REPLICATE ir<%cond0> = icmp vp<%6>, ir<13>
+; CHECK-NEXT:      REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<%6>
+; CHECK-NEXT:      REPLICATE ir<%s> = select ir<%cond0>, ir<10>, ir<20>
 ; CHECK-NEXT:      REPLICATE store ir<%s>, ir<%gep>
 ; CHECK-NEXT:    Successor(s): pred.store.continue
 ; CHECK-EMPTY:
@@ -68,11 +68,14 @@
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  Successor(s): loop.0
 ; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
-; CHECK-NEXT:   EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + vp<[[CAN_IV]]>
-; CHECK-NEXT:   EMIT branch-on-count vp<[[CAN_IV_NEXT]]> vp<[[VEC_TC]]>
-; CHECK-NEXT: No successor
-; CHECK-NEXT: }
+; CHECK-NEXT:  loop.0:
+; CHECK-NEXT:    EMIT vp<%11> = VF * UF +  vp<%3>
+; CHECK-NEXT:    EMIT branch-on-count  vp<%11> vp<%1>
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:}
+
+
+
 define void @test(ptr %ptr) {
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
--- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
+++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll
@@ -780,51 +780,52 @@
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE2:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE4]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; CHECK-NEXT:    br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT2]], i32 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
 ; CHECK:       pred.udiv.if:
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP4]], [[X:%.*]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv i32 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE]]
 ; CHECK:       pred.udiv.continue:
 ; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_UDIV_IF1:%.*]], label [[PRED_UDIV_CONTINUE2]]
-; CHECK:       pred.udiv.if1:
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT2]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]]
+; CHECK:       pred.udiv.if3:
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = add nsw i32 [[TMP11]], [[X]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    [[TMP14:%.*]] = udiv i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP14]], i32 1
-; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE2]]
-; CHECK:       pred.udiv.continue2:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], [[PRED_UDIV_IF1]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT]], <2 x i32> [[TMP16]], <2 x i32> [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP18]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = udiv i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1
+; CHECK-NEXT:    br label [[PRED_UDIV_CONTINUE4]]
+; CHECK:       pred.udiv.continue4:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP14]], [[PRED_UDIV_IF3]] ]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT2]], <i1 true, i1 true>
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[BROADCAST_SPLAT2]], <2 x i32> [[TMP15]], <2 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP17]] = add <2 x i32> [[VEC_PHI]], [[PREDPHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP18]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP17]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -843,7 +844,7 @@
 ; CHECK-NEXT:    [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP39:![0-9]+]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[T7:%.*]] = phi i32 [ [[T6]], [[FOR_INC]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[T7]]
 ;
 ; UNROLL-NO-VF-LABEL: @predicated_udiv_scalarized_operand(