Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8125,8 +8125,7 @@
       VPBuilder::InsertPointGuard Guard(Builder);
       Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
       if (CM.TTI.emitGetActiveLaneMask()) {
-        VPValue *TC = Plan->getOrCreateTripCount();
-        BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
+        BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
       } else {
         VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
         BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -234,6 +234,10 @@
   /// Get the generated Value for a given VPValue and given Part and Lane.
   Value *get(VPValue *Def, const VPIteration &Instance);

+  void setTripCount(Value *V) { TripCount = V; }
+
+  Value *getTripCount() const { return TripCount; }
+
   bool hasVectorValue(VPValue *Def, unsigned Part) {
     auto I = Data.PerPartOutput.find(Def);
     return I != Data.PerPartOutput.end() && Part < I->second.size() &&
@@ -340,6 +344,9 @@
   /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
   Value *CanonicalIV = nullptr;

+  /// Hold the original scalar trip count.
+  Value *TripCount = nullptr;
+
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;

@@ -2457,10 +2464,6 @@
   /// definitions must be immutable and hold a pointer to their underlying IR.
   DenseMap<Value *, VPValue *> VPExternalDefs;

-  /// Represents the trip count of the original loop, for folding
-  /// the tail.
-  VPValue *TripCount = nullptr;
-
   /// Represents the backedge taken count of the original loop, for folding
   /// the tail. It equals TripCount - 1.
   VPValue *BackedgeTakenCount = nullptr;
@@ -2499,8 +2502,6 @@
     }
     for (VPValue *VPV : VPValuesToFree)
       delete VPV;
-    if (TripCount)
-      delete TripCount;
     if (BackedgeTakenCount)
       delete BackedgeTakenCount;
     for (auto &P : VPExternalDefs)
@@ -2523,13 +2524,6 @@
     return Entry;
   }

-  /// The trip count of the original loop.
-  VPValue *getOrCreateTripCount() {
-    if (!TripCount)
-      TripCount = new VPValue();
-    return TripCount;
-  }
-
   /// The backedge taken count of the original loop.
   VPValue *getOrCreateBackedgeTakenCount() {
     if (!BackedgeTakenCount)
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -732,7 +732,7 @@
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
     // Get the original loop tripcount.
-    Value *ScalarTC = State.get(getOperand(1), Part);
+    Value *ScalarTC = State.getTripCount();

     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
     auto *PredTy = VectorType::get(Int1Ty, State.VF);
@@ -788,10 +788,20 @@
   case VPInstruction::BranchOnCount: {
     if (Part != 0)
       break;
-    // First create the compare.
+    // First create the compare if necessary.
     Value *IV = State.get(getOperand(0), Part);
-    Value *TC = State.get(getOperand(1), Part);
-    Value *Cond = Builder.CreateICmpEQ(IV, TC);
+    Value *VTC = State.get(getOperand(1), Part);
+    Value *TC = State.getTripCount();
+
+    Value *ConstCmp = nullptr;
+    // When we know there will only be one vector iteration there is no need to
+    // create the comparison, since we already know the answer.
+    if (auto *C = dyn_cast<ConstantInt>(TC)) {
+      uint64_t TCVal = C->getZExtValue();
+      if (TCVal && TCVal <= State.UF * State.VF.getKnownMinValue())
+        ConstCmp = Builder.getInt1(true);
+    }
+    Value *Cond = ConstCmp ? ConstCmp : Builder.CreateICmpEQ(IV, VTC);

     // Now create the branch.
     auto *Plan = getParent()->getPlan();
@@ -898,10 +908,7 @@
                              Value *CanonicalIVStartValue,
                              VPTransformState &State) {
   // Check if the trip count is needed, and if so build it.
-  if (TripCount && TripCount->getNumUsers()) {
-    for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
-      State.set(TripCount, TripCountV, Part);
-  }
+  State.setTripCount(TripCountV);

   // Check if the backedge taken count is needed, and if so build it.
   if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -44,11 +44,7 @@
 ; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; CHECK: {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
 ; CHECK: call void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8> {{%.*}}, <vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK: [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[VF:%.*]] = mul i64 [[VSCALE]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
-; CHECK-NEXT: br i1 [[COND]], label %middle.block, label %vector.body
+; CHECK: br i1 true, label %middle.block, label %vector.body
 ;
 entry:
   br label %for.body
Index: llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
+++ llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll
@@ -26,8 +26,7 @@
 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16** [[TMP3]] to <2 x i16*>*
 ; CHECK-NEXT: store <2 x i16*> , <2 x i16*>* [[TMP4]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
+++ llvm/test/Transforms/LoopVectorize/X86/outer_loop_test1_no_explicit_vect_width.ll
@@ -72,9 +72,7 @@
 ; AVX: [[ForInc]]:
 ; AVX: %[[IndNext]] = add nuw i64 %[[Ind]], 8
-; AVX: %[[VecIndNext]] = add <8 x i64> %[[VecInd]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX: %[[Cmp:.*]] = icmp eq i64 %[[IndNext]], 8
-; AVX: br i1 %[[Cmp]], label %middle.block, label %vector.body
+; AVX: br i1 true, label %middle.block, label %vector.body

 @arr2 = external global [8 x i32], align 16
 @arr = external global [8 x [8 x i32]], align 16
Index: llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
+++ llvm/test/Transforms/LoopVectorize/X86/pr34438.ll
@@ -30,8 +30,7 @@
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
 ; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
Index: llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
+++ llvm/test/Transforms/LoopVectorize/X86/pr42674.ll
@@ -9,26 +9,18 @@
 define zeroext i8 @sum() {
 ; CHECK-LABEL: @sum(
 ; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 0
 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>*
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 64
 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>*
 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 16
-; CHECK-NEXT: [[TMP4]] = add <64 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
-; CHECK: middle.block:
+; CHECK-NEXT: [[TMP4:%.*]] = add <64 x i8> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = add <64 x i8> [[WIDE_LOAD2]], zeroinitializer
+; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 128
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
-; CHECK-NEXT: ret i8 [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
+; CHECK-NEXT: ret i8 [[TMP6]]
 ;
 entry:
   br label %for.body
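
Note on the BranchOnCount hunk above: the exit branch is only folded to `br i1 true` when the scalar trip count is a compile-time constant that already fits in a single vector iteration (UF unrolled copies of a VF-wide body). The following is a minimal standalone sketch of that condition, not LLVM code; the function name, the std::optional return type and the plain-integer parameters are assumptions made here for illustration, and only the arithmetic mirrors the patch:

  #include <cstdint>
  #include <optional>

  // Hypothetical helper mirroring the check added in VPInstruction::BranchOnCount:
  // fold the backedge compare to `true` only when the constant trip count is
  // non-zero and no larger than UF * VF (known minimum value for scalable VFs);
  // otherwise the normal `icmp eq IV, VTC` is still emitted.
  std::optional<bool> foldsToSingleVectorIteration(std::optional<uint64_t> ConstTC,
                                                   unsigned UF, unsigned VFKnownMin) {
    if (!ConstTC)
      return std::nullopt;                  // Trip count not constant: keep the compare.
    uint64_t TCVal = *ConstTC;
    if (TCVal && TCVal <= uint64_t(UF) * VFKnownMin)
      return true;                          // Whole loop fits in one vector iteration.
    return std::nullopt;
  }

This matches the test updates above: pr34438.ll has a trip count of 8 with an 8-wide body and constant-fold.ll has a trip count of 2 with a 2-wide body, so both exit branches become `br i1 true`.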