Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1603,6 +1603,11 @@ return getTailFoldingStyle() != TailFoldingStyle::None; } + /// Returns true if current loop can delete the remainder. + bool RemoveFoldTail() const { + return CanRemoveFoldTail; + } + /// Returns true if the instructions in this block requires predication /// for any reason, e.g. because tail folding now requires a predicate /// or because the block in the original loop was predicated. @@ -1769,6 +1774,9 @@ /// All blocks of loop are to be masked to fold tail of scalar iterations. bool CanFoldTailByMasking = false; + /// Don't need the remainder because the trip count is multiples of VF. + bool CanRemoveFoldTail = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -3243,7 +3251,7 @@ // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. if (!Cost->requiresScalarEpilogue(VF.isVector()) && - !Cost->foldTailByMasking()) { + !(Cost->foldTailByMasking() || Cost->RemoveFoldTail())) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -5206,6 +5214,7 @@ if (Rem->isZero()) { // Accept MaxFixedVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); + CanRemoveFoldTail = true; return MaxFactors; } } Index: llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll @@ -31,8 +31,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -801,8 +801,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] Index: llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll @@ -18,8 +18,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4, 4 -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -69,8 +68,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 4, 4 -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] Index: llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -27,8 +27,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2 -; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[BB3:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 2, [[MIDDLE_BLOCK]] ], [ 0, [[BB1:%.*]] ] ; CHECK-NEXT: br label [[BB2:%.*]] Index: llvm/test/Transforms/LoopVectorize/X86/optsize.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -234,8 +234,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -274,8 +273,7 @@ ; AUTOVF-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; AUTOVF-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AUTOVF: middle.block: -; AUTOVF-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256 -; AUTOVF-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; AUTOVF-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; AUTOVF: scalar.ph: ; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ] ; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] Index: llvm/test/Transforms/LoopVectorize/X86/pr34438.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -29,8 +29,7 @@ ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8 -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] Index: llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -175,8 +175,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16 -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] Index: llvm/test/Transforms/LoopVectorize/create-induction-resume.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/create-induction-resume.ll +++ llvm/test/Transforms/LoopVectorize/create-induction-resume.ll @@ -53,8 +53,7 @@ ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 12, 12 -; CHECK-NEXT: br i1 [[CMP_N]], label [[L2_HEADER_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[L2_HEADER_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[L2_INNER_HEADER_PREHEADER]] ] ; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 13, [[MIDDLE_BLOCK]] ], [ 1, [[L2_INNER_HEADER_PREHEADER]] ] Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll +++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll @@ -30,8 +30,7 @@ ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1800, 1800 -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1800, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll +++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll @@ -26,8 +26,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[ALIGNEDTC]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -91,8 +90,7 @@ ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] Index: llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll +++ llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll @@ -66,8 +66,7 @@ ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1024, 1024 -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK: for.body: ; CHECK: for.end.loopexit: Index: llvm/test/Transforms/LoopVectorize/pr44488-predication.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr44488-predication.ll +++ llvm/test/Transforms/LoopVectorize/pr44488-predication.ll @@ -44,8 +44,7 @@ ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, 12 -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 111, [[MIDDLE_BLOCK]] ], [ 99, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]]