diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1623,10 +1623,16 @@
   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                     bool &NeedToScalarize) const;

+  /// Split \p TotalTripCount into main and remainder loop trip counts for \p VF.
+  std::pair<uint64_t, uint64_t>
+  calculateMainEpilogTripCount(const VectorizationFactor &VF,
+                               unsigned TotalTripCount) const;
+
   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
   /// that of B.
   bool isMoreProfitable(const VectorizationFactor &A,
-                        const VectorizationFactor &B) const;
+                        const VectorizationFactor &B,
+                        const bool IsMainLoop) const;

   /// Invalidates decisions already taken by the cost model.
   void invalidateCostModelingDecisions() {
@@ -6036,25 +6042,59 @@
   return MaxVF;
 }

-bool LoopVectorizationCostModel::isMoreProfitable(
-    const VectorizationFactor &A, const VectorizationFactor &B) const {
-  InstructionCost CostA = A.Cost;
-  InstructionCost CostB = B.Cost;
+std::pair<uint64_t, uint64_t>
+LoopVectorizationCostModel::calculateMainEpilogTripCount(
+    const VectorizationFactor &VF, unsigned TotalTripCount) const {
+  uint64_t MainTripCount = TotalTripCount / VF.Width.getKnownMinValue();
+  uint64_t EpilogTripCount = TotalTripCount % VF.Width.getKnownMinValue();
+
+  // In "foldTailByMasking" mode, remainder iterations are executed as part of
+  // the main vector loop. That means all remainder iterations will be executed
+  // as one masked vector iteration.
+  if (EpilogTripCount != 0 && foldTailByMasking()) {
+    ++MainTripCount;
+    EpilogTripCount = 0;
+  }
+
+  // In "requiresScalarEpilogue" mode, there should be at least one iteration
+  // executed in the epilogue loop. If all iterations end up being executed
+  // as part of the main vector loop, forward one vector iteration to the
+  // remainder loop.
+  if (EpilogTripCount == 0 && requiresScalarEpilogue(VF.Width)) {
+    assert(MainTripCount > 0 && "Trip count is expected to be > 0");
+    --MainTripCount;
+    EpilogTripCount = VF.Width.getKnownMinValue();
+  }
+  return {MainTripCount, EpilogTripCount};
+}
+
+bool LoopVectorizationCostModel::isMoreProfitable(const VectorizationFactor &A,
+                                                  const VectorizationFactor &B,
+                                                  const bool IsMainLoop) const {
+  unsigned MainTripCountA = 0;
+  unsigned EpilogueTripCountA = 0;
+  unsigned MainTripCountB = 0;
+  unsigned EpilogueTripCountB = 0;
+  APInt FixedCostDiff{128, 0, true};
+  APInt MainCostDiff{128, 0, true};
+  APInt EpilogueCostDiff{128, 0, true};
+
+  // Invalid cost is less profitable than any other cost (valid or invalid).
+  if (!A.Cost.isValid())
+    return false;

-  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
+  // Valid cost (A.Cost is valid at this point) is more profitable than any
+  // invalid cost.
+  if (!B.Cost.isValid()) {
+    return true;
+  }

-  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
-      MaxTripCount) {
-    // If we are folding the tail and the trip count is a known (possibly small)
-    // constant, the trip count will be rounded up to an integer number of
-    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
-    // which we compare directly. When not folding the tail, the total cost will
-    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
-    // approximated with the per-lane cost below instead of using the tripcount
-    // as here.
-    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
-    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
-    return RTCostA < RTCostB;
+  if (IsMainLoop) {
+    InstructionCost RTChecksCost = 0;
+    InstructionCost FixedCostA = A.Width.isScalar() ? 0 : RTChecksCost;
+    InstructionCost FixedCostB = B.Width.isScalar() ? 0 : RTChecksCost;
+    FixedCostDiff = {128, (uint64_t)(*(FixedCostB - FixedCostA).getValue()),
+                     true};
   }

   // When set to preferred, for now assume vscale may be larger than 1, so
@@ -6062,14 +6102,37 @@
   // vectorization.
   if (Hints->isScalableVectorizationPreferred())
     if (A.Width.isScalable() && !B.Width.isScalable())
-      return (CostA * B.Width.getKnownMinValue()) <=
-             (CostB * A.Width.getKnownMinValue());
-
-  // To avoid the need for FP division:
-  //      (CostA / A.Width) < (CostB / B.Width)
-  // <=>  (CostA * B.Width) < (CostB * A.Width)
-  return (CostA * B.Width.getKnownMinValue()) <
-         (CostB * A.Width.getKnownMinValue());
+      FixedCostDiff += 1;
+
+  auto MaybeTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
+  unsigned ExpectedTC =
+      MaybeTC ? *MaybeTC : std::numeric_limits<unsigned>::max();
+  std::tie(MainTripCountA, EpilogueTripCountA) =
+      calculateMainEpilogTripCount(A, ExpectedTC);
+  std::tie(MainTripCountB, EpilogueTripCountB) =
+      calculateMainEpilogTripCount(B, ExpectedTC);
+
+  MainCostDiff =
+      APInt{128, (uint64_t)(*B.Cost.getValue()), true} * MainTripCountB -
+      APInt{128, (uint64_t)(*A.Cost.getValue()), true} * MainTripCountA;
+  EpilogueCostDiff =
+      APInt{128, (uint64_t)(*getScalarVF().Cost.getValue()), true} *
+      APInt{128, (uint64_t)((int64_t)EpilogueTripCountB -
+                            (int64_t)EpilogueTripCountA),
+            true};
+
+  auto ResDiff = FixedCostDiff + MainCostDiff + EpilogueCostDiff;
+  bool IsProfitable = ResDiff.sgt(0);
+
+  LLVM_DEBUG(dbgs() << "LV: Vectorization of the "
+                    << (IsMainLoop ? "main" : "epilogue") << " loop of width "
+                    << A.Width.getKnownMinValue()
+                    << (IsProfitable ? " is more" : " is less")
+                    << " profitable than of width "
+                    << B.Width.getKnownMinValue()
+                    << " by: " << (int64_t)ResDiff.getLimitedValue(64) << "\n");
+
+  return IsProfitable;
 }

 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
@@ -6112,7 +6175,7 @@
     // during epilogue vectorization cost modeling.
    CachedVFs.push_back(Candidate);

-    if (isMoreProfitable(Candidate, ChosenFactor))
+    if (isMoreProfitable(Candidate, ChosenFactor, true))
      ChosenFactor = Candidate;
  }

@@ -6309,7 +6372,7 @@
    if (NextVF.Width.isScalar())
      continue;
    if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
-        isMoreProfitable(NextVF, Result) &&
+        isMoreProfitable(NextVF, Result, false) &&
        LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
      Result = NextVF;
  }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -27,23 +27,23 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4, !llvm.access.group !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 20
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -55,7 +55,7 @@
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -112,7 +112,7 @@
 ; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -128,7 +128,7 @@
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -183,7 +183,7 @@
 ; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -200,7 +200,7 @@
 ; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
 ;
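The trade-off the new cost comparison encodes, and which the updated vect.omp.force.small-tc.ll checks reflect (VF 4 instead of VF 8 for a trip count of 20), can be sketched outside of LLVM as follows. This is a minimal standalone illustration, not code from the patch: plain uint64_t stands in for InstructionCost/APInt, the tail-folding and required-scalar-epilogue cases are omitted, and ScalarIterCost/FixedVectorCost are made-up values rather than numbers taken from the cost model.

// Standalone sketch (C++17): total cost = fixed vector overhead
//                            + per-iteration vector cost * main trip count
//                            + per-iteration scalar cost * epilogue trip count.
#include <cstdint>
#include <iostream>
#include <utility>

struct Factor {
  uint64_t Width; // vectorization factor (lanes per vector iteration)
  uint64_t Cost;  // cost of one vector iteration at this width (assumed value)
};

// Split a known total trip count into main-loop and epilogue iterations,
// mirroring the simple case of calculateMainEpilogTripCount().
static std::pair<uint64_t, uint64_t> splitTripCount(uint64_t TC, uint64_t VF) {
  return {TC / VF, TC % VF};
}

// Fixed overhead (e.g. runtime checks) is charged only to vector factors.
static uint64_t totalCost(const Factor &F, uint64_t TC, uint64_t ScalarIterCost,
                          uint64_t FixedVectorCost) {
  auto [MainTC, EpilogTC] = splitTripCount(TC, F.Width);
  uint64_t Fixed = F.Width == 1 ? 0 : FixedVectorCost;
  return Fixed + F.Cost * MainTC + ScalarIterCost * EpilogTC;
}

int main() {
  const uint64_t TC = 20;             // total trip count, as in the test above
  const uint64_t ScalarIterCost = 4;  // assumed per-iteration scalar cost
  const uint64_t FixedVectorCost = 6; // assumed runtime-check overhead
  Factor VF8{8, 10}, VF4{4, 6};       // assumed per-iteration vector costs

  uint64_t C8 = totalCost(VF8, TC, ScalarIterCost, FixedVectorCost);
  uint64_t C4 = totalCost(VF4, TC, ScalarIterCost, FixedVectorCost);
  std::cout << "VF8 total cost: " << C8 << "\nVF4 total cost: " << C4 << "\n"
            << (C4 < C8 ? "VF4" : "VF8") << " is more profitable\n";
  return 0;
}

With these illustrative numbers VF 4 wins (36 vs. 42) because 20 % 4 == 0 leaves no scalar remainder, while VF 8 pays for 4 scalar epilogue iterations; a purely per-lane comparison would have preferred VF 8.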