diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1636,6 +1636,11 @@
     Scalars.clear();
   }
 
+  /// \return true if any of the instructions has an invalid cost. Any
+  /// such instructions are captured in \p Invalid.
+  bool hasInvalidCosts(ElementCount VF,
+                       SmallVectorImpl<Instruction *> *Invalid = nullptr);
+
 private:
   unsigned NumPredStores = 0;
 
@@ -1675,7 +1680,9 @@
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
-  VectorizationCostTy expectedCost(ElementCount VF);
+  VectorizationCostTy
+  expectedCost(ElementCount VF,
+               SmallVectorImpl<Instruction *> *Invalid = nullptr);
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
@@ -5694,8 +5701,14 @@
     auto MaxSafeUserVF =
         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
 
-    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
-      return UserVF;
+    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
+      // If `VF=vscale x N` is safe, then so is `VF=N`
+      if (UserVF.isScalable())
+        return FixedScalableVFPair(
+            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
+      else
+        return UserVF;
+    }
 
     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
 
@@ -6041,17 +6054,25 @@
     if (i.isScalar())
       continue;
 
-    // Notice that the vector loop needs to be executed less times, so
-    // we need to divide the cost of the vector loops by the width of
-    // the vector elements.
-    VectorizationCostTy C = expectedCost(i);
+    SmallVector<Instruction *> InvalidCosts;
+    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
+    if (!C.first.isValid()) {
+      // Print an opt-report explaining why the VF is not considered.
+      std::string Message;
+      raw_string_ostream OS(Message);
+      OS << "Not vectorizing with VF=" << i
+         << " because of instructions with invalid cost:\n";
+      for (const auto *I : InvalidCosts)
+        OS << "\t" << *I << "\n";
+      OS.flush();
+      reportVectorizationInfo(Message, "InvalidCost", ORE, TheLoop,
+                              InvalidCosts[0]);
+    }
 
-    assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
     VectorizationFactor Candidate(i, C.first);
     LLVM_DEBUG(
         dbgs() << "LV: Vector loop of width " << i << " costs: "
-               << (*Candidate.Cost.getValue() /
-                   Candidate.Width.getKnownMinValue())
+               << (Candidate.Cost / Candidate.Width.getKnownMinValue())
                << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
                << ".\n");
 
@@ -6078,8 +6099,7 @@
   }
 
   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
-                 *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
-                 dbgs()
+                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
              << "LV: Vectorization seems to be not beneficial, "
              << "but was forced by a user.\n");
   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
@@ -6407,8 +6427,9 @@
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
   if (LoopCost == 0) {
-    assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
-    LoopCost = *expectedCost(VF).first.getValue();
+    InstructionCost C = expectedCost(VF).first;
+    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
+    LoopCost = *C.getValue();
   }
 
   assert(LoopCost && "Non-zero loop cost expected");
@@ -6850,8 +6871,14 @@
   return *Discount.getValue();
 }
 
+bool LoopVectorizationCostModel::hasInvalidCosts(
+    ElementCount VF, SmallVectorImpl<Instruction *> *Invalid) {
+  return !expectedCost(VF, Invalid).first.isValid();
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(ElementCount VF) {
+LoopVectorizationCostModel::expectedCost(
+    ElementCount VF, SmallVectorImpl<Instruction *> *Invalid) {
   VectorizationCostTy Cost;
 
   // For each block.
@@ -6871,6 +6898,10 @@
       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
         C.first = InstructionCost(ForceTargetInstructionCost);
 
+      // Keep a list of instructions with invalid costs.
+      if (Invalid && !C.first.isValid())
+        Invalid->push_back(&I);
+
       BlockCost.first += C.first;
       BlockCost.second |= C.second;
       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
@@ -7264,6 +7295,8 @@
 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                      ElementCount VF) const {
 
+  // There is no mechanism yet to create a scalable scalarization loop,
+  // so this is currently Invalid.
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
@@ -7982,17 +8015,21 @@
       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
   if (!UserVF.isZero() && UserVFIsLegal) {
-    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
-                      << " VF " << UserVF << ".\n");
     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
     CM.selectUserVectorizationFactor(UserVF);
-    CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes(UserVF, UserVF);
-    LLVM_DEBUG(printPlans(dbgs()));
-    return {{UserVF, 0}};
+    if (!CM.hasInvalidCosts(UserVF)) {
+      LLVM_DEBUG(dbgs() << "LV: Using "
+                        << "user VF " << UserVF << ".\n");
+      CM.collectInLoopReductions();
+      buildVPlansWithVPRecipes(UserVF, UserVF);
+      LLVM_DEBUG(printPlans(dbgs()));
+      return {{UserVF, 0}};
+    } else
+      reportVectorizationInfo("UserVF ignored because of invalid costs.",
+                              "InvalidCost", ORE, OrigLoop);
   }
 
   // Populate the set of Vectorization Factor Candidates.
@@ -8758,8 +8795,6 @@
     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
-    assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
-           "Either the intrinsic cost or vector call cost must be valid");
     return UseVectorIntrinsic || !NeedToScalarize;
   };
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on \
+; RUN:     -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
 
 define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) {
 ; CHECK-LABEL: @vec_load
@@ -75,7 +77,7 @@
 ; CHECK-LABEL: @vec_intrinsic
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
 entry:
   %cmp7 = icmp sgt i64 %N, 0
   br i1 %cmp7, label %for.body, label %for.end
@@ -95,17 +97,100 @@
   ret void
 }
 
+; CHECK-REMARKS: UserVF ignored because of invalid costs.
+; CHECK-REMARKS-NEXT: Not vectorizing with VF=vscale x 1 because of instructions with invalid cost:
+; CHECK-REMARKS-NEXT: %1 = tail call fast float @llvm.sin.f32(float %0)
+; CHECK-REMARKS: Not vectorizing with VF=vscale x 2 because of instructions with invalid cost:
+; CHECK-REMARKS-NEXT: %1 = tail call fast float @llvm.sin.f32(float %0)
+define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_no_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call fast float @llvm.sin.f32(float %0)
+  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
+  store float %1, float* %arrayidx1, align 4
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+; CHECK-REMARKS: UserVF ignored because of invalid costs.
+; CHECK-REMARKS-NEXT: Not vectorizing with VF=vscale x 1 because of instructions with invalid cost:
+; CHECK-REMARKS-NEXT: %1 = tail call fast float @llvm.sin.f32(float %0)
+; CHECK-REMARKS: Not vectorizing with VF=vscale x 2 because of instructions with invalid cost:
+; CHECK-REMARKS-NEXT: %1 = tail call fast float @llvm.sin.f32(float %0)
+define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_fixed_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call fast float @llvm.sin.f32(float %0) #3
+  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
+  store float %1, float* %arrayidx1, align 4
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+; Even though there are no function mappings attached to the call
+; in the loop below we can still vectorize the loop because SVE has
+; hardware support in the form of the 'fqsrt' instruction.
+define void @vec_sqrt_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) #0 {
+; CHECK: @vec_sqrt_no_mapping
+; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
+  %0 = load float, float* %arrayidx, align 4
+  %1 = tail call fast float @llvm.sqrt.f32(float %0)
+  %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
+  store float %1, float* %arrayidx1, align 4
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+}
+
+
 declare double @foo(double)
 declare i64 @bar(i64*)
 declare double @llvm.sin.f64(double)
+declare float @llvm.sin.f32(float)
+declare float @llvm.sqrt.f32(float)
 
 declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
 declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*>)
-declare <vscale x 2 x double> @sin_vec(<vscale x 2 x double>)
+declare <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double>)
+declare <2 x double> @sin_vec_v2f64(<2 x double>)
 
 attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" }
 attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" }
-attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec)" }
+attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec_nxv2f64)" }
+attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" }
 
 !1 = distinct !{!1, !2, !3}
 !2 = !{!"llvm.loop.vectorize.width", i32 2}