diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h --- a/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/llvm/include/llvm/Analysis/CodeMetrics.h @@ -15,6 +15,7 @@ #define LLVM_ANALYSIS_CODEMETRICS_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/InstructionCost.h" namespace llvm { class AssumptionCache; @@ -47,14 +48,14 @@ /// True if this function calls alloca (in the C sense). bool usesDynamicAlloca = false; - /// Number of instructions in the analyzed blocks. - unsigned NumInsts = false; + /// Code size cost of the analyzed blocks. + InstructionCost NumInsts = 0; /// Number of analyzed blocks. unsigned NumBlocks = false; /// Keeps track of basic block code size estimates. - DenseMap<const BasicBlock *, unsigned> NumBBInsts; + DenseMap<const BasicBlock *, InstructionCost> NumBBInsts; /// Keep track of the number of calls to 'big' functions. unsigned NumCalls = false; diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -117,13 +117,6 @@ const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) { ++NumBlocks; - // Use a proxy variable for NumInsts of type InstructionCost, so that it can - // use InstructionCost's arithmetic properties such as saturation when this - // feature is added to InstructionCost. - // When storing the value back to NumInsts, we can assume all costs are Valid - // because the IR should not contain any nodes that cannot be costed. If that - // happens the cost-model is broken. - InstructionCost NumInstsProxy = NumInsts; InstructionCost NumInstsBeforeThisBB = NumInsts; for (const Instruction &I : *BB) { // Skip ephemeral values. 
@@ -183,8 +176,7 @@ if (InvI->cannotDuplicate()) notDuplicatable = true; - NumInstsProxy += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); - NumInsts = *NumInstsProxy.getValue(); + NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize); } if (isa<ReturnInst>(BB->getTerminator())) @@ -204,6 +196,6 @@ notDuplicatable |= isa<IndirectBrInst>(BB->getTerminator()); // Remember NumInsts for this BB. - InstructionCost NumInstsThisBB = NumInstsProxy - NumInstsBeforeThisBB; - NumBBInsts[BB] = *NumInstsThisBB.getValue(); + InstructionCost NumInstsThisBB = NumInsts - NumInstsBeforeThisBB; + NumBBInsts[BB] = NumInstsThisBB; } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -550,9 +550,9 @@ // shouldn't specialize it. Set the specialization cost to Invalid. // Or if the lines of codes implies that this function is easy to get // inlined so that we shouldn't specialize it. 
- if (Metrics.notDuplicatable || + if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() || (!ForceFunctionSpecialization && - Metrics.NumInsts < SmallFunctionThreshold)) { + *Metrics.NumInsts.getValue() < SmallFunctionThreshold)) { InstructionCost C{}; C.setInvalid(); return C; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -828,6 +828,16 @@ }); return false; } + + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " + << "instructions with invalid cost.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst", Switch) + << "Contains instructions with invalid cost."; + }); + return false; + } } unsigned DuplicationCost = 0; @@ -841,7 +851,7 @@ // using binary search, hence the LogBase2(). unsigned CondBranches = APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); - DuplicationCost = Metrics.NumInsts / CondBranches; + DuplicationCost = *Metrics.NumInsts.getValue() / CondBranches; } else { // Compared with jump tables, the DFA optimizer removes an indirect branch // on each loop iteration, thus making branch prediction more precise. The @@ -849,7 +859,7 @@ // predictor to make a mistake, and the more benefit there is in the DFA // optimizer. Thus, the more branch targets there are, the lower is the // cost of the DFA opt. 
- DuplicationCost = Metrics.NumInsts / JumpTableSize; + DuplicationCost = *Metrics.NumInsts.getValue() / JumpTableSize; } LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -299,7 +299,11 @@ } Metrics.analyzeBasicBlock(BB, *TTI, EphValues); } - unsigned LoopSize = Metrics.NumInsts; + + if (!Metrics.NumInsts.isValid()) + return MadeChange; + + unsigned LoopSize = *Metrics.NumInsts.getValue(); if (!LoopSize) LoopSize = 1; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -674,7 +674,9 @@ NotDuplicatable = Metrics.notDuplicatable; Convergent = Metrics.convergent; - unsigned LoopSize = Metrics.NumInsts; + // FIXME: This will crash for invalid InstructionCost, we should update the + // callers to gracefully bailout in this case. + unsigned LoopSize = *Metrics.NumInsts.getValue(); // Don't allow an estimate of size zero. 
This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -310,7 +310,13 @@ L->dump()); return Rotated; } - if (Metrics.NumInsts > MaxHeaderSize) { + if (!Metrics.NumInsts.isValid()) { + LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions" + " with invalid cost: "; + L->dump()); + return Rotated; + } + if (*Metrics.NumInsts.getValue() > MaxHeaderSize) { LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains " << Metrics.NumInsts << " instructions, which is more than the threshold (" diff --git a/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll b/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll --- a/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll +++ b/llvm/test/Transforms/LoopRotate/RISCV/invalid-cost.ll @@ -76,3 +76,46 @@ ret void } +; This demonstrates a case where a) loop rotate needs a cost estimate to +; know if rotation is profitable, and b) there is no cost estimate available +; due to invalid costs in the loop. We can't rotate this loop. 
+define void @invalid_dup_required(<vscale x 1 x i8>* %p) nounwind ssp { +; CHECK-LABEL: @invalid_dup_required( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] +; CHECK-NEXT: [[A:%.*]] = load <vscale x 1 x i8>, <vscale x 1 x i8>* [[P:%.*]], align 1 +; CHECK-NEXT: [[B:%.*]] = add <vscale x 1 x i8> [[A]], [[A]] +; CHECK-NEXT: store <vscale x 1 x i8> [[B]], <vscale x 1 x i8>* [[P]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: call void @f() +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %a = load <vscale x 1 x i8>, <vscale x 1 x i8>* %p + %b = add <vscale x 1 x i8> %a, %a + store <vscale x 1 x i8> %b, <vscale x 1 x i8>* %p + %cmp = icmp slt i32 %i.0, 100 + br i1 %cmp, label %for.body, label %for.end + + +for.body: ; preds = %for.cond + call void @f() + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare void @f()