diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -125,6 +125,26 @@
   std::string ScalarName; /// Scalar Function Name.
   std::string VectorName; /// Vector Function Name associated to this VFInfo.
   VFISAKind ISA;          /// Instruction Set Architecture.
+
+  unsigned getParamIndexForMask() const {
+    auto MaskPos = getParamIndexForOptionalMask();
+    if (MaskPos)
+      return *MaskPos;
+
+    llvm_unreachable("Requested parameter index of non-existent mask!");
+  }
+
+  bool isMasked() const { return getParamIndexForOptionalMask().has_value(); }
+
+private:
+  Optional<unsigned> getParamIndexForOptionalMask() const {
+    unsigned ParamCount = Shape.Parameters.size();
+    for (unsigned i = 0; i < ParamCount; ++i)
+      if (Shape.Parameters[i].ParamKind == VFParamKind::GlobalPredicate)
+        return i;
+
+    return None;
+  }
 };
 
 namespace VFABI {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1107,6 +1107,21 @@
     if (isa<NoAliasScopeDeclInst>(&I))
       continue;
 
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      // Check whether we have at least one masked vector version of a scalar
+      // function.
+      bool HasMaskedVersion = false;
+
+      auto Mappings = VFDatabase::getMappings(*CI);
+      for (VFInfo Info : Mappings)
+        HasMaskedVersion |= Info.isMasked();
+
+      if (HasMaskedVersion) {
+        MaskedOp.insert(CI);
+        continue;
+      }
+    }
+
     // Loads are handled via masking (or speculated if safe to do so.)
     if (auto *LI = dyn_cast<LoadInst>(&I)) {
       if (!SafePtrs.count(LI->getPointerOperand()))
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -475,7 +475,7 @@
   /// Widen a single call instruction within the innermost loop.
   void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
-                            VPTransformState &State);
+                            VPTransformState &State, bool MaskAvailable);
 
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -1533,6 +1533,7 @@
   /// scalarized -
   /// i.e. either vector version isn't available, or is too expensive.
   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
+                                    bool NeedsMask,
                                     bool &NeedToScalarize) const;
 
   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
@@ -3401,6 +3402,7 @@
 InstructionCost
 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
+                                              bool NeedsMask,
                                               bool &NeedToScalarize) const {
   Function *F = CI->getCalledFunction();
   Type *ScalarRetTy = CI->getType();
@@ -3432,8 +3434,16 @@
   // If we can't emit a vector call for this function, then the currently found
   // cost is the cost we need to return.
   NeedToScalarize = true;
-  VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
+  VFShape Shape = VFShape::get(*CI, VF, NeedsMask);
   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  // If we want an unmasked vector function but can't find one matching the VF,
+  // and the target supports an active lane mask, maybe we can find a vector
+  // function that does use a mask and synthesize an all-true mask.
+  if (!VecFunc && !NeedsMask &&
+      TTI.emitGetActiveLaneMask() != PredicationStyle::None) {
+    Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+    VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  }
 
   if (!TLI || CI->isNoBuiltin() || !VecFunc)
     return Cost;
@@ -4155,22 +4165,20 @@
 void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
                                                VPUser &ArgOperands,
-                                               VPTransformState &State) {
+                                               VPTransformState &State,
+                                               bool MaskAvailable) {
   assert(!isa<DbgInfoIntrinsic>(CI) &&
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFromInst(&CI);
 
-  SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI.args())
-    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
-
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
 
   // The flag shows whether we use Intrinsic or a usual Call for vectorized
   // version of the instruction.
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool NeedToScalarize = false;
-  InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
+  InstructionCost CallCost =
+      Cost->getVectorCallCost(&CI, VF, MaskAvailable, NeedToScalarize);
   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
@@ -4179,6 +4187,13 @@
   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
          "Either the intrinsic cost or vector call cost must be valid");
 
+  // If we added a mask operand in the recipe, extract it so that we can
+  // insert it in the right position for the vectorized call. The mask isn't
+  // guaranteed to be the last argument.
+  VPValue *VPMask = nullptr;
+  if (MaskAvailable)
+    VPMask = ArgOperands.removeAndReturnLastOperand();
+
   for (unsigned Part = 0; Part < UF; ++Part) {
     SmallVector<Type *, 2> TysForDecl = {CI.getType()};
     SmallVector<Value *, 4> Args;
@@ -4197,6 +4212,9 @@
     }
 
     Function *VectorF;
+    bool VectorFTakesMask = false;
+    unsigned VectorFMaskPos = 0;
+
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
       if (VF.isVector())
@@ -4206,22 +4224,52 @@
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
       // Use vector version of the function call.
-      const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
+      VFShape Shape = VFShape::get(CI, VF, MaskAvailable);
+
+      VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+
+      if (!VectorF && !MaskAvailable &&
+          TTI->emitGetActiveLaneMask() != PredicationStyle::None) {
+        Shape = VFShape::get(CI, VF, /*HasGlobalPred=*/true);
+        VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+      }
 #ifndef NDEBUG
-      assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
-             "Can't create vector function.");
+      assert(VectorF != nullptr && "Can't create vector function.");
 #endif
-      VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
+      // Check the VFInfo for masking details.
+      for (VFInfo Info : VFDatabase(CI).getMappings(CI)) {
+        if (Info.Shape == Shape) {
+          VectorFTakesMask = Info.isMasked();
+          if (VectorFTakesMask)
+            VectorFMaskPos = Info.getParamIndexForMask();
+          break;
+        }
+      }
+    }
+
+    assert((!MaskAvailable || VectorFTakesMask) &&
+           "Mask supplied for function with no mask argument");
+
+    if (VectorFTakesMask) {
+      Value *Mask = nullptr;
+      if (VPMask)
+        Mask = State.get(VPMask, Part);
+      else
+        Mask = ConstantInt::getTrue(VectorType::get(
+            IntegerType::getInt1Ty(VectorF->getFunctionType()->getContext()),
+            VF));
+      Args.insert(Args.begin() + VectorFMaskPos, Mask);
     }
-    SmallVector<OperandBundleDef, 1> OpBundles;
-    CI.getOperandBundlesAsDefs(OpBundles);
-    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
-    if (isa<FPMathOperator>(V))
-      V->copyFastMathFlags(&CI);
+    SmallVector<OperandBundleDef, 1> OpBundles;
+    CI.getOperandBundlesAsDefs(OpBundles);
+    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+    if (isa<FPMathOperator>(V))
+      V->copyFastMathFlags(&CI);
 
-    State.set(Def, V, Part);
-    State.addMetadata(V, &CI);
+    State.set(Def, V, Part);
+    State.addMetadata(V, &CI);
   }
 }
@@ -7284,7 +7332,8 @@
     return *RedCost;
   bool NeedToScalarize;
   CallInst *CI = cast<CallInst>(I);
-  InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
+  InstructionCost CallCost =
+      getVectorCallCost(CI, VF, Legal->isMaskRequired(CI), NeedToScalarize);
   if (getVectorIntrinsicIDForCall(CI, TLI)) {
     InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
     return std::min(CallCost, IntrinsicCost);
@@ -8270,7 +8319,8 @@
 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                    ArrayRef<VPValue *> Operands,
-                                                   VFRange &Range) const {
+                                                   VFRange &Range,
+                                                   VPlanPtr &Plan) {
 
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [this, CI](ElementCount VF) {
@@ -8281,6 +8331,10 @@
   if (IsPredicated)
     return nullptr;
 
+  VPValue *Mask = nullptr;
+  if (Legal->isMaskRequired(CI))
+    Mask = createBlockInMask(CI->getParent(), Plan);
+
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
@@ -8295,7 +8349,8 @@
         // version of the instruction.
         // Is it beneficial to perform intrinsic call compared to lib call?
         bool NeedToScalarize = false;
-        InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+        InstructionCost CallCost = CM.getVectorCallCost(
+            CI, VF, Legal->isMaskRequired(CI), NeedToScalarize);
         InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
         bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
         return UseVectorIntrinsic || !NeedToScalarize;
@@ -8305,7 +8360,7 @@
     return nullptr;
 
   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
-  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
+  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), Mask);
 }
 
 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
@@ -8564,7 +8619,7 @@
     return nullptr;
 
   if (auto *CI = dyn_cast<CallInst>(Instr))
-    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
+    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
 
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
@@ -9273,7 +9328,7 @@
 void VPWidenCallRecipe::execute(VPTransformState &State) {
   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
-                                  *this, State);
+                                  *this, State, Mask);
 }
 
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -95,7 +95,7 @@
   /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
   /// decision from \p Range.Start to \p Range.End.
   VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef<VPValue *> Operands,
-                                    VFRange &Range) const;
+                                    VFRange &Range, VPlanPtr &Plan);
 
   /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
   /// if it can. The function should only be called if the cost-model indicates
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -946,12 +946,17 @@
 /// A recipe for widening Call instructions.
 class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
+  bool Mask;
 
 public:
   template <typename IterT>
-  VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+  VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
+                    VPValue *MaskVal = nullptr)
       : VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),
-        VPValue(VPValue::VPVWidenCallSC, &I, this) {}
+        VPValue(VPValue::VPVWidenCallSC, &I, this), Mask(MaskVal != nullptr) {
+    if (MaskVal)
+      addOperand(MaskVal);
+  }
 
   ~VPWidenCallRecipe() override = default;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -270,6 +270,12 @@
     Op->removeUser(*this);
   }
 
+  VPValue *removeAndReturnLastOperand() {
+    VPValue *Op = Operands.pop_back_val();
+    Op->removeUser(*this);
+    return Op;
+  }
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -8,10 +8,14 @@
 ; primary mask, and that without tail folding we synthesize an all-true mask.
 define void @test_widen(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen(
-; LV-NOT: call @foo_vector
-; TFALWAYS-NOT: vector.body
-; TFALWAYS-NOT: call @foo_vector
-; TFFALLBACK-NOT: call @foo_vector
+; LV: %[[LOAD:.+]] = load <vscale x 2 x i64>
+; LV: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer))
+; TFALWAYS: %[[MASK:.+]] = phi <vscale x 2 x i1>
+; TFALWAYS: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64
+; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MASK]])
+; TFFALLBACK: %[[MASK:.+]] = phi <vscale x 2 x i1>
+; TFFALLBACK: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MASK]])
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -34,9 +38,9 @@
 ; Check that a simple conditional call can be vectorized.
 define void @test_if_then(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_if_then(
-; LV-NOT: call @foo_vector
-; TFALWAYS-NOT: call @foo_vector
-; TFFALLBACK-NOT: call @foo_vector
+; LV: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.load, <vscale x 2 x i1> %{{.+}})
+; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %{{.+}})
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %{{.+}})
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -70,12 +74,31 @@
 ; we just see a splat of the parameter instead. More work needed.
 define void @test_widen_if_then_else(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen_if_then_else
-; LV-NOT: call @foo_vector
-; LV-NOT: call @foo_uniform
-; TFALWAYS-NOT: call @foo_vector
-; TFALWAYS-NOT: call @foo_uniform
-; TFFALLBACK-NOT: call @foo_vector
-; TFFALLBACK-NOT: call @foo_uniform
+; LV: %[[LOAD:.+]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* %{{[0-9]+}}
+; LV: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; LV: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; LV: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[INV]])
+; LV: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[CMP]])
+; LV: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[INV]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
+; LV: store <vscale x 2 x i64> %[[PPHI]]
+; TFALWAYS: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.+}}, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+; TFALWAYS: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; TFALWAYS: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; TFALWAYS: %[[MERGE1:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[INV]], <vscale x 2 x i1> zeroinitializer
+; TFALWAYS: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[MERGE1]])
+; TFALWAYS: %[[MERGE2:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[CMP]], <vscale x 2 x i1> zeroinitializer
+; TFALWAYS: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MERGE2]])
+; TFALWAYS: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[MERGE1]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
+; TFALWAYS: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> %[[PPHI]]
+; TFFALLBACK: %[[LOAD:.+]] = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* %{{.+}}, i32 8, <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i64> poison)
+; TFFALLBACK: %[[CMP:.+]] = icmp ugt <vscale x 2 x i64> %[[LOAD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 50, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; TFFALLBACK: %[[INV:.+]] = xor <vscale x 2 x i1> %[[CMP]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
+; TFFALLBACK: %[[MERGE1:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[INV]], <vscale x 2 x i1> zeroinitializer
+; TFFALLBACK: %[[UNIFORM_SPLAT:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %[[MERGE1]])
+; TFFALLBACK: %[[MERGE2:.+]] = select <vscale x 2 x i1> %active.lane.mask, <vscale x 2 x i1> %[[CMP]], <vscale x 2 x i1> zeroinitializer
+; TFFALLBACK: %[[VECTOR:.+]] = call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %[[LOAD]], <vscale x 2 x i1> %[[MERGE2]])
+; TFFALLBACK: %[[PPHI:.+]] = select <vscale x 2 x i1> %[[MERGE1]], <vscale x 2 x i64> %[[UNIFORM_SPLAT]], <vscale x 2 x i64> %[[VECTOR]]
+; TFFALLBACK: call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> %[[PPHI]]
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -112,10 +135,9 @@
 ; unpredicated body with scalar tail can use the unmasked variant.
 define void @test_widen_nomask(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen_nomask(
-; LV: call @foo_vector_nomask
-; TFALWAYS-NOT: vector.body
+; LV: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
 ; TFALWAYS-NOT: call @foo_vector_nomask
-; TFFALLBACK: call @foo_vector_nomask
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
 ; CHECK: ret void
 entry:
   br label %for.body
@@ -140,10 +162,9 @@
 ; version.
 define void @test_widen_optmask(i64* noalias %a, i64* readnone %b) #4 {
 ; CHECK-LABEL: @test_widen_optmask(
-; LV: call @foo_vector_nomask
-; TFALWAYS-NOT: vector.body
-; TFALWAYS-NOT: call @foo_vector
-; TFFALLBACK: call @foo_vector_nomask
+; LV: call <vscale x 2 x i64> @foo_vector_nomask(<vscale x 2 x i64> %wide.load)
+; TFALWAYS: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %active.lane.mask)
+; TFFALLBACK: call <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64> %wide.masked.load, <vscale x 2 x i1> %active.lane.mask)
 ; CHECK: ret void
 entry:
   br label %for.body
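
For context, the variant mappings that VFDatabase::getMappings reads (and that the new isMasked()/getParamIndexForMask() helpers inspect) come from the "vector-function-abi-variant" attribute on the scalar call site; the 'M' token in the mangled name marks a variant that takes a global predicate (mask) parameter. The actual declarations in masked-call.ll sit outside the hunks shown above, so the following is only a minimal illustrative sketch of such a mapping; the function names, types, and VF below are assumptions, not taken from the patch:

declare i64 @foo(i64)
declare <vscale x 2 x i64> @foo_vector(<vscale x 2 x i64>, <vscale x 2 x i1>)

define void @caller(i64* %p) {
  %v = load i64, i64* %p
  ; The call-site attribute maps @foo to the masked SVE variant @foo_vector:
  ; "_ZGVsMxv" = SVE ISA ('s'), masked ('M'), scalable VF ('x'), one vector
  ; parameter ('v'); the redirected vector name follows in parentheses.
  %r = call i64 @foo(i64 %v) #0
  store i64 %r, i64* %p
  ret void
}

attributes #0 = { "vector-function-abi-variant"="_ZGVsMxv_foo(foo_vector)" }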