diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -36,6 +36,7 @@
 class PredicatedScalarEvolution;
 class LoopVectorizationRequirements;
 class LoopVectorizeHints;
+class GeneratedRTChecks;
 class OptimizationRemarkEmitter;
 class VPRecipeBuilder;
 
@@ -255,7 +256,8 @@
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
-  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
+  Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC,
+                                     GeneratedRTChecks &Checks);
 
   /// Use the VPlan-native path to plan how to best vectorize, return the best
   /// VF and its cost.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -335,6 +335,11 @@
     cl::desc(
         "Prefer predicating a reduction operation over an after loop select."));
 
+static cl::opt<double> RuntimeCheckOverheadFraction(
+    "lv-runtime-check-overhead-fraction", cl::init(0.005), cl::Hidden,
+    cl::desc("The maximum fraction of the expected runtime of the loop that "
+             "the runtime checks may add as overhead."));
+
 cl::opt<bool> EnableVPlanNativePath(
     "enable-vplan-native-path", cl::init(false), cl::Hidden,
     cl::desc("Enable VPlan-native vectorization path with "
@@ -430,10 +435,10 @@
   return None;
 }
 
+namespace llvm {
 // Forward declare GeneratedRTChecks.
 class GeneratedRTChecks;
 
-namespace llvm {
 
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
@@ -1606,9 +1611,6 @@
     Scalars.clear();
   }
 
-private:
-  unsigned NumPredStores = 0;
-
   /// \return An upper bound for the vectorization factor, a power-of-2 larger
   /// than zero. One is returned if vectorization should best be avoided due
   /// to cost.
@@ -1624,16 +1626,21 @@
   /// actually taken place).
   using VectorizationCostTy = std::pair<InstructionCost, bool>;
 
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
+
+  float ScalarCost;
+
+private:
+  unsigned NumPredStores = 0;
+
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width.
   VectorizationCostTy expectedCost(ElementCount VF);
 
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
   /// The cost-computation logic from getInstructionCost which provides
   /// the vector type as an output parameter.
   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
@@ -1848,7 +1855,6 @@
   /// Profitable vector factors.
   SmallVector<VectorizationFactor, 8> ProfitableVFs;
 };
-} // end namespace llvm
 
 /// Helper struct to manage generating runtime checks for vectorization.
 ///
@@ -1953,6 +1959,25 @@
     }
   }
 
+  unsigned getCost(LoopVectorizationCostModel &CM) {
+    unsigned RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1))
+                            .first.getValue();
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost += *CM.getInstructionCost(&I, ElementCount::getFixed(1))
+                            .first.getValue();
+      }
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -2051,6 +2076,7 @@
     return MemCheckBlock;
   }
 };
+} // end namespace llvm
 
 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
 // vectorization. The loop needs to be annotated with #pragma omp simd
@@ -3207,7 +3233,6 @@
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
@@ -5847,7 +5872,7 @@
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
   auto Width = ElementCount::getFixed(1);
-  const float ScalarCost = *ExpectedCost.getValue();
+  ScalarCost = *ExpectedCost.getValue();
   float Cost = ScalarCost;
 
   bool ForceVectorization =
       Hints->getForce() == LoopVectorizeHints::FK_Enabled;
@@ -7700,7 +7725,8 @@
 }
 
 Optional<VectorizationFactor>
-LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
+LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
+                               GeneratedRTChecks &Checks) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
   Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
   if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
@@ -7740,6 +7766,8 @@
     CM.collectInLoopReductions();
     buildVPlansWithVPRecipes(VF, VF);
     LLVM_DEBUG(printPlans(dbgs()));
+
+    Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate());
     return {{VF, 0}};
   }
 
@@ -7767,24 +7795,41 @@
   // Select the optimal vectorization factor.
   auto SelectedVF = CM.selectVectorizationFactor(MaxVF);
 
+  if (!SelectedVF.Width.isScalar())
+    Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate());
+
   // Check if it is profitable to vectorize with runtime checks.
   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
-    bool PragmaThresholdReached =
-        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-    bool ThresholdReached =
-        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
-    if ((ThresholdReached && !Hints.allowReordering()) ||
-        PragmaThresholdReached) {
-      ORE->emit([&]() {
-        return OptimizationRemarkMissed(DEBUG_TYPE, "CantReorderMemOps",
-                                        OrigLoop->getStartLoc(),
-                                        OrigLoop->getHeader())
-               << "loop not vectorized: cannot prove it is safe to reorder "
-                  "memory operations";
-      });
-      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
-      return VectorizationFactor::Disabled();
+    bool CanIgnoreRTThreshold = false;
+    if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), OrigLoop)) {
+      unsigned RTCost = Checks.getCost(CM);
+      // If the expected cost of the runtime checks is a small fraction of the
+      // expected cost of the scalar loop, we can be more aggressive with
+      // using runtime checks.
+      CanIgnoreRTThreshold =
+          RTCost < (*ExpectedTC * CM.ScalarCost * RuntimeCheckOverheadFraction);
+      LLVM_DEBUG(dbgs() << "LV: Cost of runtime check: " << RTCost << " "
+                        << *ExpectedTC * CM.ScalarCost << "\n");
+    }
+
+    if (!CanIgnoreRTThreshold) {
+      bool PragmaThresholdReached =
+          NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+      bool ThresholdReached = NumRuntimePointerChecks >
+                              VectorizerParams::RuntimeMemoryCheckThreshold;
+      if ((ThresholdReached && !Hints.allowReordering()) ||
+          PragmaThresholdReached) {
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "CantReorderMemOps",
+                                          OrigLoop->getStartLoc(),
+                                          OrigLoop->getHeader())
+                 << "loop not vectorized: cannot prove it is safe to reorder "
+                    "memory operations";
+        });
+        LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+        return None;
+      }
     }
   }
   return SelectedVF;
@@ -9648,8 +9693,10 @@
   ElementCount UserVF = Hints.getWidth();
   unsigned UserIC = Hints.getInterleave();
 
+  GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
+                           F->getParent()->getDataLayout());
   // Plan how to best vectorize, return the best VF and its cost.
-  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC, Checks);
 
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;
@@ -9745,13 +9792,6 @@
   bool DisableRuntimeUnroll = false;
   MDNode *OrigLoopID = L->getLoopID();
   {
-    // Optimistically generate runtime checks. Drop them if they turn out to not
-    // be profitable. Limit the scope of Checks, so the cleanup happens
-    // immediately after vector codegeneration is done.
-    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
-                             F->getParent()->getDataLayout());
-    if (!VF.Width.isScalar() || IC > 1)
-      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
     LVP.setBestPlan(VF.Width, IC);
 
     using namespace ore;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/runtime-check-size-based-threshold.ll
@@ -1,14 +1,19 @@
-; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=DEFAULT %s
+; RUN: opt -loop-vectorize -lv-runtime-check-overhead-fraction=0.5 -mtriple=arm64-apple-iphoneos -S %s | FileCheck --check-prefix=CHECK --check-prefix=CUSTOM %s
 
 ; Tests for loops with large numbers of runtime checks. Check that loops are
 ; vectorized, if the loop trip counts are large and the impact of the runtime
 ; checks is very small compared to the expected loop runtimes.
 
-; The trip count in the loop in this function is too to warrant large runtime checks.
+; The trip count in the loop in this function is too small to warrant large
+; runtime checks with the default threshold. It should be vectorized with
+; a larger custom threshold.
 ; CHECK-LABEL: define {{.*}} @test_tc_too_small
-; CHECK-NOT: vector.memcheck
-; CHECK-NOT: vector.body
+; DEFAULT-NOT: vector.memcheck
+; DEFAULT-NOT: vector.body
+; CUSTOM: vector.memcheck
+; CUSTOM: vector.body
 define void @test_tc_too_small(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) {
 entry:
   br label %loop
@@ -57,11 +62,11 @@
   ret void
 }
 
-; FIXME
-; The trip count in the loop in this function high enough to warrant large runtime checks.
+; The trip count in the loop in this function is high enough to warrant large
+; runtime checks.
 ; CHECK-LABEL: define {{.*}} @test_tc_big_enough
-; CHECK-NOT: vector.memcheck
-; CHECK-NOT: vector.body
+; CHECK: vector.memcheck
+; CHECK: vector.body
 define void @test_tc_big_enough(i16* %ptr.1, i16* %ptr.2, i16* %ptr.3, i16* %ptr.4, i64 %off.1, i64 %off.2) {
 entry:
   br label %loop