diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -477,7 +477,7 @@ /// loop. /// In the case of epilogue vectorization, this function is overriden to /// handle the more complex control flow around the loops. - virtual BasicBlock *createVectorizedLoopSkeleton(); + virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(); /// Widen a single instruction within the innermost loop. void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, @@ -964,14 +964,16 @@ // Override this function to handle the more complex control flow around the // three loops. - BasicBlock *createVectorizedLoopSkeleton() final override { + std::pair<BasicBlock *, Value *> + createVectorizedLoopSkeleton() final override { return createEpilogueVectorizedLoopSkeleton(); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. - virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0; + virtual std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -998,7 +1000,8 @@ EPI, LVL, CM, BFI, PSI) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -1028,7 +1031,8 @@ EPI, LVL, CM, BFI, PSI) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). 
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final override; + std::pair<BasicBlock *, Value *> + createEpilogueVectorizedLoopSkeleton() final override; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -3448,7 +3452,8 @@ return LoopVectorPreHeader; } -BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3530,7 +3535,7 @@ // Emit phis for the new starting index of the scalar loop. createInductionResumeValues(Lp, CountRoundDown); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -5716,14 +5721,6 @@ return false; } - // Induction variables that are widened require special handling that is - // currently not supported. 
- if (any_of(Legal->getInductionVars(), [&](auto &Entry) { - return !(isScalarAfterVectorization(Entry.first, VF) || - isProfitableToScalarize(Entry.first, VF)); - })) - return false; - return true; } @@ -7397,12 +7394,21 @@ VPTransformState State{*BestVF, BestUF, LI, DT, ILV.Builder, ILV.VectorLoopValueMap, &ILV, CallbackILV}; - State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(); + Value *IndStart = nullptr; + std::tie(State.CFG.PrevBB, IndStart) = ILV.createVectorizedLoopSkeleton(); State.TripCount = ILV.getOrCreateTripCount(nullptr); State.CanonicalIV = ILV.Induction; ILV.printDebugTracesAtStart(); + if (IndStart) { + auto *VPBB = VPlans.front()->getEntry()->getEntryBasicBlock(); + for (auto &R : *VPBB) { + if (auto *Ind = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) + Ind->setStart(IndStart); + } + } + //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -7532,7 +7538,8 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. -BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +std::pair<BasicBlock *, Value *> +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton(""); @@ -7586,7 +7593,7 @@ // because the vplan in the second pass still contains the inductions from the // original loop. - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7664,7 +7671,7 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
-BasicBlock * +std::pair<BasicBlock *, Value *> EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { MDNode *OrigLoopID = OrigLoop->getLoopID(); Loop *Lp = createVectorLoopSkeleton("vec.epilog."); @@ -7745,7 +7752,7 @@ EPI.VectorTripCount} /* AdditionalBypass */); AddRuntimeUnrollDisableMetaData(Lp); - return completeLoopSkeleton(Lp, OrigLoopID); + return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal}; } BasicBlock * @@ -9185,8 +9192,10 @@ LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF); EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; + EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI); + LVP.executePlan(EpilogILV, DT); ++LoopsEpilogueVectorized; diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-widened-inductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-widened-inductions.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-widened-inductions.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -epilogue-vectorization-force-VF=4 -force-vector-interleave=2 -S %s | FileCheck %s +; RUN: opt -passes='loop-vectorize' -epilogue-vectorization-force-VF=4 -force-vector-interleave=2 -S %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-n32:64-v256:256:256-v512:512:512" + +define void @f3(i8* noalias %A, i64 %n) { +; CHECK-LABEL: @f3( +; CHECK-NEXT: iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub 
i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 +; CHECK-NEXT: [[STEP_ADD4:%.*]] = add <4 x i32> [[VEC_IND3]], <i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[VEC_IND3]] to <4 x i8> +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[STEP_ADD4]] to <4 x i8> +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP8]], <4 x i8>* [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 4 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP9]], <4 x i8>* [[TMP15]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i32> [[STEP_ADD4]], <i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[N]], [[N_MOD_VF7]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[VEC_EPILOG_RESUME_VAL]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], <i64 0, i64 1, i64 2, i64 3> +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[VEC_EPILOG_RESUME_VAL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT15:%.*]] = insertelement <4 x i32> undef, i32 [[TMP17]], i32 0 +; CHECK-NEXT: [[DOTSPLAT16:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT15]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION17:%.*]] = add <4 x i32> [[DOTSPLAT16]], <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND12:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND18:%.*]] = phi <4 x i32> [ [[INDUCTION17]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT20:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 
[[INDEX9]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX9]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX9]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX9]], 3 +; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[VEC_IND18]] to <4 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[TMP22]], <4 x i8>* [[TMP25]], align 1 +; CHECK-NEXT: [[INDEX_NEXT10]] = add i64 [[INDEX9]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT14]] = add <4 x i64> [[VEC_IND12]], <i64 4, i64 4, i64 4, i64 4> +; CHECK-NEXT: [[VEC_IND_NEXT20]] = add <4 x i32> [[VEC_IND18]], <i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N11]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[TMP27]] to i8 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]] +; CHECK-NEXT: store i8 [[CONV]], i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]] 
+; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %0 = trunc i64 %iv to i32 + %conv = trunc i32 %0 to i8 + %arrayidx = getelementptr inbounds i8, i8* %A, i64 %iv + store i8 %conv, i8* %arrayidx, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp ne i64 %iv.next, %n + br i1 %exitcond, label %for.body, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +}