diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1532,25 +1532,20 @@ })); } - /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - /// If a non-zero VF has been calculated, we check if I will be scalarized - /// predication for that VF. - bool isScalarWithPredication(Instruction *I) const; - - // Returns true if \p I is an instruction that will be predicated either - // through scalar predication or masked load/store or masked gather/scatter. - // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { - if (!Legal->blockNeedsPredication(I->getParent()) && !foldTailByMasking()) - return false; - // Loads and stores that need some form of masked operation are predicated - // instructions. - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - return Legal->isMaskRequired(I); - return isScalarWithPredication(I); - } + /// Returns true if \p I is an instruction that requires predication, but + /// cannot use predicated vector ops to implement the widened operations, thus + /// having to fall back on scalarized operations. + /// If \p VFIterationIsPredicated is true, then the vector iteration will use + /// predication to enable only the active lanes handled in the VF iteration, + /// even if the block is not predicated in the original scalar loop (e.g. for + /// tail folding). + bool isScalarWithPredication(Instruction *I, + bool VFIterationIsPredicated) const; + + /// Return true if the instruction requires a predicated vector operation + /// when widening \p I to a vector. Such instructions include conditional + /// stores and instructions that may divide by zero. 
+ bool requiresPredication(Instruction *I, bool VFIterationIsPredicated) const; /// Returns true if \p I is a memory instruction with consecutive memory access that can be widened. @@ -5251,16 +5246,40 @@ Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { - if (!Legal->blockNeedsPredication(I->getParent()) && !foldTailByMasking()) +bool LoopVectorizationCostModel::requiresPredication( + Instruction *I, bool VFIterationIsPredicated) const { + // If no predication is used, no predication is needed for the widened op. + if (!Legal->blockNeedsPredication(I->getParent()) && !VFIterationIsPredicated) return false; - switch(I->getOpcode()) { + + switch (I->getOpcode()) { default: break; case Instruction::Load: - case Instruction::Store: { - if (!Legal->isMaskRequired(I)) - return false; + case Instruction::Store: + // Some instructions can be speculated, even when predication is used + // for the block. + return Legal->isMaskRequired(I); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // If predication is used for this block and the operation would otherwise + // be guarded, then this requires a predicated vector operation. + return mayDivideByZero(*I); + } + + // By default, all operations can be widened safely without predication. + return false; +} + +bool LoopVectorizationCostModel::isScalarWithPredication( + Instruction *I, bool VFIterationIsPredicated) const { + if (!requiresPredication(I, VFIterationIsPredicated)) + return false; + + // See if masked.load/store instructions are legal. 
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) { auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); @@ -5269,13 +5288,10 @@ : !(isLegalMaskedStore(Ty, Ptr, Alignment) || TTI.isLegalMaskedScatter(Ty, Alignment)); } - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - return mayDivideByZero(*I); - } - return false; + + // Until the LV adds support for using llvm.vp intrinsics to handle + // these operations, fall back on predicated scalar operations. + return true; } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( @@ -5340,7 +5356,7 @@ // If the instruction is a store located in a predicated block, it will be // scalarized. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -5391,7 +5407,7 @@ << *I << "\n"); return; } - if (isScalarWithPredication(I)) { + if (isScalarWithPredication(I, foldTailByMasking())) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -6799,7 +6815,7 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isPredicatedInst(I) && + assert(requiresPredication(I, foldTailByMasking()) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && @@ -6827,7 +6843,7 @@ if (!Legal->blockNeedsPredication(BB) && !foldTailByMasking()) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I, foldTailByMasking())) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. @@ -6869,7 +6885,7 @@ // If the instruction is scalar with predication, it will be analyzed // separately. 
We ignore it within the context of PredInst. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6916,7 +6932,8 @@ // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, foldTailByMasking()) && + !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), true, false); @@ -7077,7 +7094,7 @@ // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (requiresPredication(I, foldTailByMasking())) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -7465,7 +7482,8 @@ // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + if (isa<StoreInst>(&I) && + isScalarWithPredication(&I, foldTailByMasking())) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -7720,7 +7738,7 @@ // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. 
- if (VF.isVector() && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I, foldTailByMasking())) { InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -8906,7 +8924,9 @@ VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, CM.foldTailByMasking()); + }, Range); if (IsPredicated) @@ -8946,7 +8966,8 @@ // scalarization is profitable or it is predicated. auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, CM.foldTailByMasking()); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -9020,7 +9041,10 @@ Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { + return CM.requiresPredication(I, CM.foldTailByMasking()); + }, + Range); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -9067,7 +9091,7 @@ continue; auto *RepR = cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef()); - assert(RepR->isPredicated() && + assert(RepR->isPredicated() && "expected Replicate recipe to be predicated"); RepR->setAlsoPack(false); }