Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -795,6 +795,7 @@ unsigned MainLoopUF = 0; ElementCount EpilogueVF = ElementCount::getFixed(0); unsigned EpilogueUF = 0; + bool TailFoldEpilogue = false; BasicBlock *MainLoopIterationCountCheck = nullptr; BasicBlock *EpilogueIterationCountCheck = nullptr; BasicBlock *SCEVSafetyCheck = nullptr; @@ -803,8 +804,9 @@ Value *VectorTripCount = nullptr; EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, - ElementCount EVF, unsigned EUF) - : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { + ElementCount EVF, unsigned EUF, bool TFE) + : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF), + TailFoldEpilogue(TFE) { assert(EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."); } @@ -1225,7 +1227,7 @@ selectVectorizationFactor(const ElementCountSet &CandidateVFs); VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, + selectEpilogueVectorizationFactor(const VectorizationScheme MaxVF, const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. @@ -1895,7 +1897,8 @@ /// Returns true if epilogue vectorization is considered profitable, and /// false otherwise. /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + bool + isUnpredicatedEpilogueVectorizationProfitable(const ElementCount VF) const; public: /// The loop that we evaluate. 
@@ -5639,7 +5642,7 @@ return true; } -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( +bool LoopVectorizationCostModel::isUnpredicatedEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into @@ -5663,26 +5666,23 @@ VectorizationFactor LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { + const VectorizationScheme MainLoopVF, const LoopVectorizationPlanner &LVP) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); + if (MainLoopVF.FoldTailByMasking) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue not required due to tail folding.\n";); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because the loop is " - "not a supported candidate.\n";); + if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF.Width)) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " + "is not a supported candidate.\n";); return Result; } @@ -5707,7 +5707,33 @@ return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (mayFoldTailByMasking()) { + // If we can fold the tail by masking to produce a predicated epilog, + // attempt to pick the scheme with the lowest cost providing it is more + // profitable than scalar. 
+ VectorizationFactor BestFoldedVF = VectorizationFactor::Disabled(); + for (auto &VF : ProfitableVFs) { + if (VF.Scheme.FoldTailByMasking && + VF.Scheme.Width.isScalable() == MainLoopVF.Width.isScalable() && + ElementCount::isKnownLE(VF.Scheme.Width, MainLoopVF.Width) && + (BestFoldedVF.Scheme.Width.isScalar() || isMoreProfitable(VF, BestFoldedVF))) + BestFoldedVF = VF; + } + if (BestFoldedVF != VectorizationFactor::Disabled()) { + LLVM_DEBUG( + dbgs() << "LEV: Vectorizing predicated epilogue loop with VF = " + << BestFoldedVF.Scheme.Width << "\n";); + return BestFoldedVF; + } + } + + if (!isScalarEpilogueAllowed()) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " + "epilogue is allowed.\n";); + return Result; + } + + if (!isUnpredicatedEpilogueVectorizationProfitable(MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5716,17 +5742,18 @@ // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4.
- ElementCount EstimatedRuntimeVF = MainLoopVF; - if (MainLoopVF.isScalable()) { - EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); + ElementCount EstimatedRuntimeVF = MainLoopVF.Width; + if (MainLoopVF.Width.isScalable()) { + EstimatedRuntimeVF = + ElementCount::getFixed(MainLoopVF.Width.getKnownMinValue()); if (std::optional VScale = getVScaleForTuning()) EstimatedRuntimeVF *= *VScale; } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Scheme.Width.isScalable() && MainLoopVF.isScalable() && + if (((!NextVF.Scheme.Width.isScalable() && MainLoopVF.Width.isScalable() && ElementCount::isKnownLT(NextVF.Scheme.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Scheme.Width, MainLoopVF)) && + ElementCount::isKnownLT(NextVF.Scheme.Width, MainLoopVF.Width)) && (Result.Scheme.Width.isScalar() || isMoreProfitable(NextVF, Result)) && LVP.hasPlanWithVF(false, NextVF.Scheme.Width)) Result = NextVF; @@ -5828,9 +5855,8 @@ // overhead. // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - - if (!isScalarEpilogueAllowed() || - TheLoop->getHeader()->getParent()->hasOptSize()) + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate || + VF.FoldTailByMasking || TheLoop->getHeader()->getParent()->hasOptSize()) return 1; // We used the distance for the interleave count. @@ -7949,10 +7975,10 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF - << ", Main Loop UF:" << EPI.MainLoopUF + << "Main Loop VF:" << EPI.MainLoopVF << ", UF:" << EPI.MainLoopUF << ", Epilogue Loop VF:" << EPI.EpilogueVF - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + << ", UF:" << EPI.EpilogueUF + << (EPI.TailFoldEpilogue ? ", with predication" : "") << "\n"; }); } @@ -7980,9 +8006,13 @@ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? 
EPI.EpilogueVF : VF) ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - Value *CheckMinIters = Builder.CreateICmp( - P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), - "min.iters.check"); + Value *CheckMinIters = + (ForEpilogue && EPI.TailFoldEpilogue) + ? Builder.getFalse() + : Builder.CreateICmp( + P, Count, + createStepForVF(Builder, Count->getType(), VFactor, UFactor), + "min.iters.check"); if (!ForEpilogue) TCCheckBlock->setName("vector.main.loop.iter.check"); @@ -8136,6 +8166,17 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert) { + // If we are creating a predicated epilogue loop, always jump to it. + if (EPI.TailFoldEpilogue) { + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, + ConstantInt::getFalse(Insert->getContext()))); + + LoopBypassBlocks.push_back(Insert); + return Insert; + } + assert(EPI.TripCount && "Expected trip count to have been safed in the first pass."); assert( @@ -8168,8 +8209,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" - << "Epilogue Loop VF:" << EPI.EpilogueVF - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + << "Epilogue Loop VF:" << EPI.EpilogueVF << ", UF:" << EPI.EpilogueUF + << (EPI.TailFoldEpilogue ? ", with predication" : "") << "\n"; }); } @@ -10554,15 +10595,14 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Scheme.Width, LVP); + CM.selectEpilogueVectorizationFactor(VF.Scheme, LVP); if (EpilogueVF.Scheme.Width.isVector()) { - // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
- // TODOD: Predicated remainders EpilogueLoopVectorizationInfo EPI(VF.Scheme.Width, IC, - EpilogueVF.Scheme.Width, 1); + EpilogueVF.Scheme.Width, 1, + EpilogueVF.Scheme.FoldTailByMasking); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); @@ -10579,7 +10619,8 @@ ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestEpiPlan = LVP.getBestPlanFor({false, EPI.EpilogueVF}); + VPlan &BestEpiPlan = LVP.getBestPlanFor( + {EpilogueVF.Scheme.FoldTailByMasking, EPI.EpilogueVF}); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -644,7 +644,8 @@ assert(all_of(IV->users(), [](const VPUser *U) { if (isa(U) || - isa(U)) + isa(U) || + isa(U)) return true; auto *VPI = cast(U); return VPI->getOpcode() == @@ -653,8 +654,7 @@ VPInstruction::CanonicalIVIncrementNUW; }) && "the canonical IV should only be used by its increments or " - "ScalarIVSteps when " - "resetting the start value"); + "ScalarIVSteps when resetting the start value"); IV->setOperand(0, VPV); } } Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -11,12 +11,12 @@ ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16' ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1 +; DEBUG: Main Loop VF:vscale x 16, UF:2, Epilogue Loop VF:vscale x 8, UF:1 ; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_16' ; DEBUG-FORCED: LEV: 
Epilogue vectorization factor is forced. ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG-FORCED: Main Loop VF:vscale x 16, UF:2, Epilogue Loop VF:8, UF:1 define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-LABEL: @main_vf_vscale_x_16( @@ -188,12 +188,12 @@ ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_2' ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG: Main Loop VF:vscale x 2, UF:2, Epilogue Loop VF:8, UF:1 ; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_2' ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced. ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG-FORCED: Main Loop VF:vscale x 2, UF:2, Epilogue Loop VF:8, UF:1 ; When the vector.body uses VF=vscale x 1 (or VF=vscale x 2 because ; that's the minimum supported VF by SVE), we could still use a wide Index: llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll @@ -8,10 +8,12 @@ ; CHECK-LABEL: @raddshift2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP10]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: 
br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -39,18 +41,50 @@ ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP14]], i32 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP16]], i32 1, <8 x i1> 
[[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP19]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD5]] to <8 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x i16> [[TMP17]], +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw <8 x i16> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = lshr <8 x i16> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[TMP23]] to <8 x i8> +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP24]], ptr [[TMP26]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT6]] = add i32 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr 
[[SRC1]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i16 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP29]] to i16 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[CONV]], 2 ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD]], [[CONV2]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ADD3]], 2 @@ -59,7 +93,7 @@ ; CHECK-NEXT: store i8 [[CONV4]], ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: @@ -103,10 +137,12 @@ ; CHECK-LABEL: @rhadd( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP10]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br 
i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -131,21 +167,53 @@ ; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP12]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP14]], i32 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 
[[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP16]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP19]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD5]] to <8 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x i16> [[TMP17]], +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw <8 x i16> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = lshr <8 x i16> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[TMP23]] to <8 x i8> +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP24]], ptr [[TMP26]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT6]] = add i32 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ] +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i16 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP29]] to i16 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[CONV]], 1 ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD]], [[CONV2]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ADD3]], 1 @@ -154,7 +222,7 @@ ; CHECK-NEXT: store i8 [[CONV4]], ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: @@ -207,10 +275,12 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[CONV2]], [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ] ; CHECK-NEXT: [[LAG_032:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ] ; CHECK-NEXT: [[CMP428:%.*]] = icmp slt i32 [[LAG_032]], [[CONV2]] -; CHECK-NEXT: br i1 [[CMP428]], label [[FOR_BODY6_PREHEADER:%.*]], label [[FOR_END]] -; CHECK: for.body6.preheader: +; CHECK-NEXT: br i1 [[CMP428]], label 
[[ITER_CHECK:%.*]], label [[FOR_END]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[INDVARS_IV]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[INDVARS_IV]], [[N_MOD_VF]] @@ -236,37 +306,72 @@ ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[INDVARS_IV]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY6_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 
[[INDVARS_IV]], 3 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[CONV1027]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP13]], i32 [[INDVARS_IV]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP15]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison) +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i32 [[TMP13]], [[LAG_032]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP19]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison) +; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD6]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = ashr <4 x i32> [[TMP21]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP23:%.*]] = select 
<4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP22]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP23]]) +; CHECK-NEXT: [[TMP25]] = add i32 [[TMP24]], [[VEC_PHI5]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add i32 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT9]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK: for.body6: -; CHECK-NEXT: [[ACCUMULATOR_030:%.*]] = phi i32 [ [[ADD11:%.*]], [[FOR_BODY6]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ACCUMULATOR_030:%.*]] = phi i32 [ [[ADD11:%.*]], [[FOR_BODY6]] ], [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[I_029]] -; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP27]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[I_029]], [[LAG_032]] ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[ADD]] -; 
CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2 -; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP28]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV9]], [[CONV7]] ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], [[CONV1027]] ; CHECK-NEXT: [[ADD11]] = add nsw i32 [[SHR]], [[ACCUMULATOR_030]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_029]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY6]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY6]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end.loopexit: -; CHECK-NEXT: [[ADD11_LCSSA:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY6]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD11_LCSSA:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY6]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[ACCUMULATOR_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[ADD11_LCSSA]], [[FOR_END_LOOPEXIT]] ] -; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[ACCUMULATOR_0_LCSSA]], 16 -; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = lshr i32 [[ACCUMULATOR_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP29]] to i16 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[AUTOCORRDATA:%.*]], i32 [[LAG_032]] ; CHECK-NEXT: store i16 [[CONV13]], ptr [[ARRAYIDX14]], align 2 ; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[LAG_032]], 1 Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ 
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,19 +1,25 @@ -; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=NOTAILPRED ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED -; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=DEFAULT - +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. -; When both are allowed, the VF=16 without tail folding should win out. +; When both are allowed, the VF=16 without tail folding should win out with a +; predicated remainder. 
; ; DEFAULT: load <16 x i8>, <16 x i8>* ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> ; DEFAULT: add <16 x i16> -; DEFAULT-NOT: llvm.masked.load -; DEFAULT-NOT: llvm.masked.store +; DEFAULT: vec.epilog.vector.body: +; DEFAULT: llvm.masked.load +; DEFAULT: llvm.masked.store +; +; NOTAILPRED: load <16 x i8>, <16 x i8>* +; NOTAILPRED: sext <16 x i8> %{{.*}} to <16 x i16> +; NOTAILPRED: add <16 x i16> +; NOTAILPRED-NOT: vec.epilog.vector.body: ; ; TAILPRED: llvm.masked.load.v8i8.p0v8i8 ; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16> Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll +++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll @@ -6,7 +6,7 @@ ; Currently we cannot handle scalable vectorization factors. ; CHECK: LV: Checking a loop in 'f1' ; CHECK: LEV: Epilogue vectorization factor is forced. -; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1 +; CHECK: Epilogue Loop VF:2, UF:1 define void @f1(ptr %A) { entry: