diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -330,6 +330,12 @@ /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB) const; + /// Return true if the instruction requires a predicated vector operation + /// when widening \p I to a vector. Such instructions include conditional + /// stores and instructions that may divide by zero. + bool requiresPredicatedWidening(Instruction *I, + bool VFIterationIsPredicated) const; + /// Check if this pointer is consecutive when vectorizing. This happens /// when the last index of the GEP is the induction variable, or that the /// pointer itself is an induction variable. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1322,4 +1322,49 @@ return true; } +/// A helper function for checking whether an integer division-related +/// instruction may divide by zero (in which case it must be predicated if +/// executed conditionally in the scalar code). +/// TODO: It may be worthwhile to generalize and check isKnownNonZero(). +/// Non-zero divisors that are non compile-time constants will not be +/// converted into multiplication, so we will still end up scalarizing +/// the division, but can do so w/o predication. 
+static bool mayDivideByZero(Instruction &I) { + assert((I.getOpcode() == Instruction::UDiv || + I.getOpcode() == Instruction::SDiv || + I.getOpcode() == Instruction::URem || + I.getOpcode() == Instruction::SRem) && + "Unexpected instruction"); + Value *Divisor = I.getOperand(1); + auto *CInt = dyn_cast<ConstantInt>(Divisor); + return !CInt || CInt->isZero(); +} + +bool LoopVectorizationLegality::requiresPredicatedWidening( + Instruction *I, bool VFIterationIsPredicated) const { + // If no predication is used, no predication is needed for the widened op. + if (!blockNeedsPredication(I->getParent()) && !VFIterationIsPredicated) + return false; + + switch (I->getOpcode()) { + default: + break; + case Instruction::Load: + case Instruction::Store: + // Some instructions can be speculated, even when predication is used + // for the block. + return isMaskRequired(I); + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::SRem: + case Instruction::URem: + // If predication is used for this block and the operation would otherwise + // be guarded, then this requires a predicated vector operation. + return mayDivideByZero(*I); + } + + // By default, all operations can be widened safely without predication. + return false; +} + } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1511,25 +1511,15 @@ })); } - /// Returns true if \p I is an instruction that will be scalarized with - /// predication. Such instructions include conditional stores and - /// instructions that may divide by zero. - /// If a non-zero VF has been calculated, we check if I will be scalarized - /// predication for that VF. 
- bool isScalarWithPredication(Instruction *I) const; - - // Returns true if \p I is an instruction that will be predicated either - // through scalar predication or masked load/store or masked gather/scatter. - // Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) { - if (!blockNeedsPredication(I->getParent())) - return false; - // Loads and stores that need some form of masked operation are predicated - // instructions. - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - return Legal->isMaskRequired(I); - return isScalarWithPredication(I); - } + /// Returns true if \p I is an instruction that requires predicated widening, + /// and cannot use predicated vector ops to implement the widened operations, + /// thus falling back on scalarized operations. + /// If \p VFIterationIsPredicated is true, then the vector iteration will use + /// predication to enable only the active lanes handled in the VF iteration, + /// even if the block is not predicated in the original scalar loop (e.g. for + /// tail folding). + bool isScalarWithPredication(Instruction *I, + bool VFIterationIsPredicated) const; /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. @@ -1576,10 +1566,6 @@ /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } - bool blockNeedsPredication(BasicBlock *BB) const { - return foldTailByMasking() || Legal->blockNeedsPredication(BB); - } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a /// MapVector to ensure deterministic iteration order. @@ -4926,24 +4912,6 @@ } } -/// A helper function for checking whether an integer division-related -/// instruction may divide by zero (in which case it must be predicated if -/// executed conditionally in the scalar code). 
-/// TODO: It may be worthwhile to generalize and check isKnownNonZero(). -/// Non-zero divisors that are non compile-time constants will not be -/// converted into multiplication, so we will still end up scalarizing -/// the division, but can do so w/o predication. -static bool mayDivideByZero(Instruction &I) { - assert((I.getOpcode() == Instruction::UDiv || - I.getOpcode() == Instruction::SDiv || - I.getOpcode() == Instruction::URem || - I.getOpcode() == Instruction::SRem) && - "Unexpected instruction"); - Value *Divisor = I.getOperand(1); - auto *CInt = dyn_cast<ConstantInt>(Divisor); - return !CInt || CInt->isZero(); -} - void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VPUser &User, VPTransformState &State) { @@ -5334,31 +5302,23 @@ Scalars[VF].insert(Worklist.begin(), Worklist.end()); } -bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const { - if (!blockNeedsPredication(I->getParent())) +bool LoopVectorizationCostModel::isScalarWithPredication( + Instruction *I, bool VFIterationIsPredicated) const { + if (!Legal->requiresPredicatedWidening(I, VFIterationIsPredicated)) return false; - switch(I->getOpcode()) { - default: - break; - case Instruction::Load: - case Instruction::Store: { - if (!Legal->isMaskRequired(I)) - return false; - auto *Ptr = getLoadStorePointerOperand(I); - auto *Ty = getLoadStoreType(I); - const Align Alignment = getLoadStoreAlignment(I); - return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || - TTI.isLegalMaskedGather(Ty, Alignment)) - : !(isLegalMaskedStore(Ty, Ptr, Alignment) || - TTI.isLegalMaskedScatter(Ty, Alignment)); - } - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::SRem: - case Instruction::URem: - return mayDivideByZero(*I); - } - return false; + + // Until the LV adds support for using llvm.vp intrinsics to handle + // these operations, fall back on predicated scalar operations. 
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + return true; + + auto *Ptr = getLoadStorePointerOperand(I); + auto *Ty = getLoadStoreType(I); + const Align Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || + TTI.isLegalMaskedGather(Ty, Alignment)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + TTI.isLegalMaskedScatter(Ty, Alignment)); } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( @@ -5413,7 +5373,7 @@ // If the instruction is a store located in a predicated block, it will be // scalarized. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -5463,7 +5423,7 @@ << *I << "\n"); return; } - if (isScalarWithPredication(I)) { + if (isScalarWithPredication(I, foldTailByMasking())) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -6703,7 +6663,7 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. - assert(isPredicatedInst(I) && + assert(Legal->requiresPredicatedWidening(I, foldTailByMasking()) && "Expecting a scalar emulated instruction"); return isa<LoadInst>(I) || (isa<StoreInst>(I) && @@ -6728,10 +6688,10 @@ // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockNeedsPredication(BB)) + if (!Legal->blockNeedsPredication(BB) && !foldTailByMasking()) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I)) { + if (isScalarWithPredication(&I, foldTailByMasking())) { ScalarCostsTy ScalarCosts; // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. 
@@ -6771,7 +6731,7 @@ // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (isScalarWithPredication(I)) + if (isScalarWithPredication(I, foldTailByMasking())) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6819,7 +6779,8 @@ // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, foldTailByMasking()) && + !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast<VectorType>(ToVectorTy(I->getType(), VF)), APInt::getAllOnesValue(VF.getKnownMinValue()), true, false); @@ -6976,7 +6937,7 @@ // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (Legal->requiresPredicatedWidening(I, foldTailByMasking())) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -7323,7 +7284,8 @@ // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa<StoreInst>(&I) && isScalarWithPredication(&I)) + if (isa<StoreInst>(&I) && + isScalarWithPredication(&I, foldTailByMasking())) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -7570,7 +7532,7 @@ // vector lane. Get the scalarization cost and scale this amount by the // probability of executing the predicated block. If the instruction is not // predicated, we fall through to the next case. 
- if (VF.isVector() && isScalarWithPredication(I)) { + if (VF.isVector() && isScalarWithPredication(I, foldTailByMasking())) { InstructionCost Cost = 0; // These instructions have a non-void type, so account for the phi nodes @@ -7965,7 +7927,8 @@ return None; // Invalidate interleave groups if all blocks of loop will be predicated. - if (CM.blockNeedsPredication(OrigLoop->getHeader()) && + if ((Legal->blockNeedsPredication(OrigLoop->getHeader()) || + CM.foldTailByMasking()) && !useMaskedInterleavedAccesses(*TTI)) { LLVM_DEBUG( dbgs() @@ -8563,7 +8526,7 @@ VPValue *BlockMask = nullptr; if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredication(BB)) + if (!Legal->blockNeedsPredication(BB) && !CM.foldTailByMasking()) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. // Create the block in mask as the first non-phi instruction in the block. @@ -8735,7 +8698,9 @@ VFRange &Range) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); }, + [this, CI](ElementCount VF) { + return CM.isScalarWithPredication(CI, CM.foldTailByMasking()); + }, Range); if (IsPredicated) @@ -8777,7 +8742,8 @@ // scalarization is profitable or it is predicated. 
auto WillScalarize = [this, I](ElementCount VF) -> bool { return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I); + CM.isProfitableToScalarize(I, VF) || + CM.isScalarWithPredication(I, CM.foldTailByMasking()); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8851,7 +8817,10 @@ Range); bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range); + [&](ElementCount VF) { + return Legal->requiresPredicatedWidening(I, CM.foldTailByMasking()); + }, + Range); auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()), IsUniform, IsPredicated);