Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -790,6 +790,7 @@ unsigned MainLoopUF = 0; ElementCount EpilogueVF = ElementCount::getFixed(0); unsigned EpilogueUF = 0; + bool TailFoldEpilogue = false; BasicBlock *MainLoopIterationCountCheck = nullptr; BasicBlock *EpilogueIterationCountCheck = nullptr; BasicBlock *SCEVSafetyCheck = nullptr; @@ -798,8 +799,9 @@ Value *VectorTripCount = nullptr; EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, - ElementCount EVF, unsigned EUF) - : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { + ElementCount EVF, unsigned EUF, bool TFE) + : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF), + TailFoldEpilogue(TFE) { assert(EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."); } @@ -5674,6 +5676,23 @@ return Result; } + // If we can fold the tail by masking to produce a predicated epilog, + // attempt to pick the scheme with the lowest cost providing it is more + // profitable than scalar. 
+ VectorizationFactor BestFoldedVF = VectorizationFactor::Disabled(); + for (auto &VF : ProfitableVFs) { + if (VF.FoldTailByMasking && + VF.Width.isScalable() == MainLoopVF.Width.isScalable() && + ElementCount::isKnownLE(VF.Width, MainLoopVF.Width) && + (Result.Width.isScalar() || isMoreProfitable(VF, Result))) + BestFoldedVF = VF; + } + if (BestFoldedVF != VectorizationFactor::Disabled()) { + LLVM_DEBUG(dbgs() << "LEV: Vectorizing predicated epilogue loop with VF = " + << BestFoldedVF.Width << "\n";); + return BestFoldedVF; + } + if (!isEpilogueVectorizationProfitable(MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); @@ -7848,10 +7867,10 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF - << ", Main Loop UF:" << EPI.MainLoopUF + << "Main Loop VF:" << EPI.MainLoopVF << ", UF:" << EPI.MainLoopUF << ", Epilogue Loop VF:" << EPI.EpilogueVF - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + << ", UF:" << EPI.EpilogueUF + << (EPI.TailFoldEpilogue ? ", with predication" : "") << "\n"; }); } @@ -7879,9 +7898,13 @@ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - Value *CheckMinIters = Builder.CreateICmp( - P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), - "min.iters.check"); + Value *CheckMinIters = + (ForEpilogue && EPI.TailFoldEpilogue) + ? Builder.getFalse() + : Builder.CreateICmp( + P, Count, + createStepForVF(Builder, Count->getType(), VFactor, UFactor), + "min.iters.check"); if (!ForEpilogue) TCCheckBlock->setName("vector.main.loop.iter.check"); @@ -8033,6 +8056,17 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert) { + // If we are creating a predicated epilogue loop, always jump to it. 
+ if (EPI.TailFoldEpilogue) { + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, + ConstantInt::getFalse(Insert->getContext()))); + + LoopBypassBlocks.push_back(Insert); + return Insert; + } + assert(EPI.TripCount && "Expected trip count to have been safed in the first pass."); assert( @@ -8065,8 +8099,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" - << "Epilogue Loop VF:" << EPI.EpilogueVF - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + << "Epilogue Loop VF:" << EPI.EpilogueVF << ", UF:" << EPI.EpilogueUF + << (EPI.TailFoldEpilogue ? ", with predication" : "") << "\n"; }); } @@ -10416,8 +10450,8 @@ // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. - // TODOD: Predicated remainders - EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); + EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, + EpilogueVF.FoldTailByMasking); VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF, false); EpilogueVectorizerMainLoop MainILV( @@ -10431,7 +10465,8 @@ // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; - VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF, false); + VPlan &BestEpiPlan = + LVP.getBestPlanFor(EPI.EpilogueVF, EPI.TailFoldEpilogue); EpilogueVectorizerEpilogueLoop EpilogILV( L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, BestEpiPlan.getCostModel(), BFI, PSI, Checks); @@ -10444,7 +10479,8 @@ // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated // before vectorizing the epilogue loop. 
for (VPRecipeBase &R : Header->phis()) { -      if (isa<VPCanonicalIVPHIRecipe>(&R)) +      if (isa<VPCanonicalIVPHIRecipe>(&R) || +          isa<VPActiveLaneMaskPHIRecipe>(&R)) continue; Value *ResumeV = nullptr; Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -644,7 +644,8 @@ assert(all_of(IV->users(), [](const VPUser *U) { if (isa<VPScalarIVStepsRecipe>(U) || -                      isa<VPDerivedIVRecipe>(U)) +                      isa<VPDerivedIVRecipe>(U) || +                      isa<VPActiveLaneMaskPHIRecipe>(U)) return true; auto *VPI = cast<VPInstruction>(U); return VPI->getOpcode() == Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -11,12 +11,12 @@ ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16' ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1 +; DEBUG: Main Loop VF:vscale x 16, UF:2, Epilogue Loop VF:vscale x 8, UF:1 ; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_16' ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced. ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG-FORCED: Main Loop VF:vscale x 16, UF:2, Epilogue Loop VF:8, UF:1 define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-LABEL: @main_vf_vscale_x_16( @@ -188,12 +188,12 @@ ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_2' ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG: Main Loop VF:vscale x 2, UF:2, Epilogue Loop VF:8, UF:1 ; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_2' ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced. 
; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG-FORCED: Main Loop VF:vscale x 2, UF:2, Epilogue Loop VF:8, UF:1 ; When the vector.body uses VF=vscale x 1 (or VF=vscale x 2 because ; that's the minimum supported VF by SVE), we could still use a wide Index: llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll @@ -8,10 +8,12 @@ ; CHECK-LABEL: @raddshift2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP10]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -39,18 +41,50 @@ ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label 
[[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP14]], i32 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP16]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP19]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD5]] to <8 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x i16> [[TMP17]], +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw <8 x i16> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = lshr <8 x i16> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[TMP23]] to <8 x i8> +; 
CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP24]], ptr [[TMP26]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT6]] = add i32 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i16 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP29]] to i16 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[CONV]], 2 ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD]], [[CONV2]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ADD3]], 2 @@ -59,7 +93,7 @@ 
; CHECK-NEXT: store i8 [[CONV4]], ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: @@ -103,10 +137,12 @@ ; CHECK-LABEL: @rhadd( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP10]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -131,21 +167,53 @@ ; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP12]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label 
[[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP14]], i32 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP16]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP19]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD5]] to <8 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x 
i16> [[TMP17]], +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw <8 x i16> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = lshr <8 x i16> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[TMP23]] to <8 x i8> +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP24]], ptr [[TMP26]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT6]] = add i32 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i16 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: 
[[CONV2:%.*]] = zext i8 [[TMP29]] to i16 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[CONV]], 1 ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD]], [[CONV2]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ADD3]], 1 @@ -154,7 +222,7 @@ ; CHECK-NEXT: store i8 [[CONV4]], ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: @@ -207,10 +275,12 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[CONV2]], [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ] ; CHECK-NEXT: [[LAG_032:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ] ; CHECK-NEXT: [[CMP428:%.*]] = icmp slt i32 [[LAG_032]], [[CONV2]] -; CHECK-NEXT: br i1 [[CMP428]], label [[FOR_BODY6_PREHEADER:%.*]], label [[FOR_END]] -; CHECK: for.body6.preheader: +; CHECK-NEXT: br i1 [[CMP428]], label [[ITER_CHECK:%.*]], label [[FOR_END]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[INDVARS_IV]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[INDVARS_IV]], [[N_MOD_VF]] @@ -236,37 +306,72 @@ ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: 
[[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[INDVARS_IV]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY6_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[CONV1027]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; 
CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP13]], i32 [[INDVARS_IV]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP15]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison) +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i32 [[TMP13]], [[LAG_032]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP19]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison) +; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD6]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = ashr <4 x i32> [[TMP21]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP22]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP23]]) +; CHECK-NEXT: [[TMP25]] = add i32 [[TMP24]], [[VEC_PHI5]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add i32 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT9]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK: for.body6: -; CHECK-NEXT: [[ACCUMULATOR_030:%.*]] = phi i32 [ [[ADD11:%.*]], [[FOR_BODY6]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ACCUMULATOR_030:%.*]] = phi i32 [ [[ADD11:%.*]], [[FOR_BODY6]] ], [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[I_029]] -; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP27]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[I_029]], [[LAG_032]] ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[ADD]] -; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2 -; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP28]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV9]], [[CONV7]] ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], [[CONV1027]] ; CHECK-NEXT: [[ADD11]] = add nsw i32 [[SHR]], [[ACCUMULATOR_030]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_029]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY6]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], 
label [[FOR_BODY6]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end.loopexit: -; CHECK-NEXT: [[ADD11_LCSSA:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY6]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD11_LCSSA:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY6]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[ACCUMULATOR_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[ADD11_LCSSA]], [[FOR_END_LOOPEXIT]] ] -; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[ACCUMULATOR_0_LCSSA]], 16 -; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = lshr i32 [[ACCUMULATOR_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP29]] to i16 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[AUTOCORRDATA:%.*]], i32 [[LAG_032]] ; CHECK-NEXT: store i16 [[CONV13]], ptr [[ARRAYIDX14]], align 2 ; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[LAG_032]], 1 Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll +++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll @@ -6,7 +6,7 @@ ; Currently we cannot handle scalable vectorization factors. ; CHECK: LV: Checking a loop in 'f1' ; CHECK: LEV: Epilogue vectorization factor is forced. -; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1 +; CHECK: Epilogue Loop VF:2, UF:1 define void @f1(ptr %A) { entry: