Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8125,8 +8125,7 @@
       VPBuilder::InsertPointGuard Guard(Builder);
       Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
       if (CM.TTI.emitGetActiveLaneMask()) {
-        VPValue *TC = Plan->getOrCreateTripCount();
-        BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
+        BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
       } else {
         VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
         BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -234,6 +234,10 @@
   /// Get the generated Value for a given VPValue and given Part and Lane.
   Value *get(VPValue *Def, const VPIteration &Instance);

+  void setTripCount(Value *V) { TripCount = V; }
+
+  Value *getTripCount() const { return TripCount; }
+
   bool hasVectorValue(VPValue *Def, unsigned Part) {
     auto I = Data.PerPartOutput.find(Def);
     return I != Data.PerPartOutput.end() && Part < I->second.size() &&
@@ -340,6 +344,9 @@
   /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
   Value *CanonicalIV = nullptr;

+  /// Hold the original scalar trip count.
+  Value *TripCount = nullptr;
+
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;

@@ -2457,10 +2464,6 @@
   /// definitions must be immutable and hold a pointer to their underlying IR.
   DenseMap<Value *, VPValue *> VPExternalDefs;

-  /// Represents the trip count of the original loop, for folding
-  /// the tail.
-  VPValue *TripCount = nullptr;
-
   /// Represents the backedge taken count of the original loop, for folding
   /// the tail. It equals TripCount - 1.
   VPValue *BackedgeTakenCount = nullptr;
@@ -2499,8 +2502,6 @@
     }
     for (VPValue *VPV : VPValuesToFree)
       delete VPV;
-    if (TripCount)
-      delete TripCount;
     if (BackedgeTakenCount)
       delete BackedgeTakenCount;
     for (auto &P : VPExternalDefs)
@@ -2523,13 +2524,6 @@
     return Entry;
   }

-  /// The trip count of the original loop.
-  VPValue *getOrCreateTripCount() {
-    if (!TripCount)
-      TripCount = new VPValue();
-    return TripCount;
-  }
-
   /// The backedge taken count of the original loop.
   VPValue *getOrCreateBackedgeTakenCount() {
     if (!BackedgeTakenCount)
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -732,7 +732,7 @@
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
     // Get the original loop tripcount.
-    Value *ScalarTC = State.get(getOperand(1), Part);
+    Value *ScalarTC = State.getTripCount();

     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
     auto *PredTy = VectorType::get(Int1Ty, State.VF);
@@ -788,10 +788,20 @@
   case VPInstruction::BranchOnCount: {
     if (Part != 0)
       break;
-    // First create the compare.
+    // First create the compare if necessary.
     Value *IV = State.get(getOperand(0), Part);
-    Value *TC = State.get(getOperand(1), Part);
-    Value *Cond = Builder.CreateICmpEQ(IV, TC);
+    Value *VTC = State.get(getOperand(1), Part);
+    Value *TC = State.getTripCount();
+
+    Value *ConstCmp = nullptr;
+    // When we know there will only be one vector iteration there is no need to
+    // create the comparison, since we already know the answer.
+    if (auto *C = dyn_cast<ConstantInt>(TC)) {
+      uint64_t TCVal = C->getZExtValue();
+      if (TCVal && TCVal <= State.UF * State.VF.getKnownMinValue())
+        ConstCmp = Builder.getInt1(true);
+    }
+    Value *Cond = ConstCmp ? ConstCmp : Builder.CreateICmpEQ(IV, VTC);

     // Now create the branch.
     auto *Plan = getParent()->getPlan();
@@ -898,10 +908,7 @@
                              Value *CanonicalIVStartValue,
                              VPTransformState &State) {
   // Check if the trip count is needed, and if so build it.
-  if (TripCount && TripCount->getNumUsers()) {
-    for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
-      State.set(TripCount, TripCountV, Part);
-  }
+  State.setTripCount(TripCountV);

   // Check if the backedge taken count is needed, and if so build it.
   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -44,11 +44,7 @@
 ; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8> {{%.*}}, <vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
-; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body
+; CHECK: br i1 true, label %middle.block, label %vector.body
 ;
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -26,8 +26,7 @@
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
 ; CHECK-NEXT: store <2 x i16*> , <2 x i16*>* [[TMP4]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -72,9 +72,7 @@
 ; AVX: [[ForInc]]:
 ; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8
-; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
-; AVX: br i1 %[[Cmp]], label %middle.block, label %vector.body
+; AVX: br i1 true, label %middle.block, label %vector.body

 @arr2 = external global [8 x i32], align 16
 @arr = external global [8 x [8 x i32]], align 16
Index: llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -30,8 +30,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
 ; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -9,26 +9,18 @@
 define zeroext i8 @sum() {
 ; CHECK-LABEL: @sum(
 ; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 0
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 64
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>*
 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 16
-; CHECK-NEXT: [[TMP4]] = add <64 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; CHECK: middle.block:
+; CHECK-NEXT: [[TMP4:%.*]] = add <64 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = add <64 x i8> [[WIDE_LOAD2]], zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 128
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
-; CHECK-NEXT: ret i8 [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
+; CHECK-NEXT: ret i8 [[TMP6]]
 ;
 entry:
   br label %for.body
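
Note on the BranchOnCount hunk above: the exit branch is only folded to `br i1 true` when the scalar trip count is a compile-time constant that already fits in a single vector iteration (UF unrolled copies of a VF-wide body). The following is a minimal standalone sketch of that condition, not LLVM code; the function name, the std::optional return type and the plain-integer parameters are assumptions made here for illustration, and only the arithmetic mirrors the patch:

  #include <cstdint>
  #include <optional>

  // Hypothetical helper mirroring the check added in VPInstruction::BranchOnCount:
  // fold the backedge compare to `true` only when the constant trip count is
  // non-zero and no larger than UF * VF (known minimum value for scalable VFs);
  // otherwise the normal `icmp eq IV, VTC` is still emitted.
  std::optional<bool> foldsToSingleVectorIteration(std::optional<uint64_t> ConstTC,
                                                   unsigned UF, unsigned VFKnownMin) {
    if (!ConstTC)
      return std::nullopt;                  // Trip count not constant: keep the compare.
    uint64_t TCVal = *ConstTC;
    if (TCVal && TCVal <= uint64_t(UF) * VFKnownMin)
      return true;                          // Whole loop fits in one vector iteration.
    return std::nullopt;
  }

This matches the test updates above: pr34438.ll has a trip count of 8 with an 8-wide body and constant-fold.ll has a trip count of 2 with a 2-wide body, so both exit branches become `br i1 true`.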