diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1200,7 +1200,8 @@
                              InterleavedAccessInfo &IAI)
       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+        Hints(Hints), InterleaveInfo(IAI),
+        UseVPlanNativePath(EnableVPlanNativePath && !L->isInnermost()) {}
 
   /// \return An upper bound for the vectorization factor, or None if
   /// vectorization and interleaving should be avoided up front.
@@ -1283,7 +1284,7 @@
 
     // Cost model is not run in the VPlan-native path - return conservative
     // result until this changes.
-    if (EnableVPlanNativePath)
+    if (UseVPlanNativePath)
      return false;
 
     auto Scalars = InstsToScalarize.find(VF);
@@ -1299,7 +1300,7 @@
 
     // Cost model is not run in the VPlan-native path - return conservative
     // result until this changes.
-    if (EnableVPlanNativePath)
+    if (UseVPlanNativePath)
      return false;
 
     auto UniformsPerVF = Uniforms.find(VF);
@@ -1315,7 +1316,7 @@
 
     // Cost model is not run in the VPlan-native path - return conservative
     // result until this changes.
-    if (EnableVPlanNativePath)
+    if (UseVPlanNativePath)
      return false;
 
     auto ScalarsPerVF = Scalars.find(VF);
@@ -1375,7 +1376,7 @@
     assert(VF.isVector() && "Expected VF to be a vector VF");
     // Cost model is not run in the VPlan-native path - return conservative
     // result until this changes.
-    if (EnableVPlanNativePath)
+    if (UseVPlanNativePath)
       return CM_GatherScatter;
 
     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
@@ -1832,6 +1833,9 @@
 
   /// Profitable vector factors.
   SmallVector<VectorizationFactor, 8> ProfitableVFs;
+
+  /// Controls whether the VPlan native path is used or not.
+  bool UseVPlanNativePath;
 };
 } // end namespace llvm
 
@@ -3039,7 +3043,7 @@
 
 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
   // VPlan-native path does not do any analysis for runtime checks currently.
-  if (EnableVPlanNativePath)
+  if (Cost->UseVPlanNativePath)
     return;
 
   // Reuse existing vector loop preheader for runtime memory checks.
@@ -3801,7 +3805,7 @@
 
   // Fix widened non-induction PHIs by setting up the PHI operands.
   if (OrigPHIsToFix.size()) {
-    assert(EnableVPlanNativePath &&
+    assert(Cost->UseVPlanNativePath &&
            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
     fixNonInductionPHIs(State);
   }
@@ -4440,9 +4444,9 @@
                                              VPValue *StartVPV, VPValue *Def,
                                              VPTransformState &State) {
   PHINode *P = cast<PHINode>(PN);
-  if (EnableVPlanNativePath) {
-    // Currently we enter here in the VPlan-native path for non-induction
-    // PHIs where all control flow is uniform. We simply widen these PHIs.
+  if (Cost->UseVPlanNativePath) {
+    // We enter here in the VPlan-native path and when the loop is not the
+    // innermost loop. We handle non-induction PHIs here and simply widen them.
     // Create a vector phi with no operands - the vector phi operands will be
     // set at the end of vector code generation.
     Type *VecTy = (State.VF.isScalar())
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -599,7 +599,8 @@
   assert((EnableVPlanNativePath ||
           isa<UnreachableInst>(LastBB->getTerminator())) &&
          "Expected InnerLoop VPlan CFG to terminate with unreachable");
-  assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
+  assert((!EnableVPlanNativePath ||
+          (L->isInnermost() || isa<BranchInst>(LastBB->getTerminator()))) &&
          "Expected VPlan CFG to terminate with branch in NativePath");
   LastBB->getTerminator()->eraseFromParent();
   BranchInst::Create(VectorLatchBB, LastBB);
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-single-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-single-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-single-loop.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
+
+; Test that when the VPlan native path is enabled and no loop is explicitly
+; marked to be vectorized, the innermost loop is still vectorized without any
+; issues. The result of the vectorization should be the same as using inner
+; loop vectorization without enabling the VPlan native path flag.
+; See PR42592 (https://bugs.llvm.org/show_bug.cgi?id=42592).
+
+target triple = "x86_64-unknown-linux-gnu"
+define void @kernel(float* nocapture readonly %0, float* nocapture readonly %1, float* nocapture %2, i64 %3, i64 %4) {
+; CHECK-LABEL: @kernel(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[TMP2:%.*]] to i8*
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP0:%.*]] to i8*
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr float, float* [[TMP2]], i64 [[TMP4]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8*
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 [[TMP4]]
+; CHECK-NEXT:    [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8*
+; CHECK-NEXT:    [[SCEVGEP4:%.*]] = getelementptr float, float* [[TMP1:%.*]], i64 [[TMP3:%.*]]
+; CHECK-NEXT:    [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
+; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr float, float* [[TMP1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[SCEVGEP67:%.*]] = bitcast float* [[SCEVGEP6]] to i8*
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCEVGEP67]], i64 1
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8* [[TMP5]], [[SCEVGEP23]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8* [[TMP6]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    [[BOUND08:%.*]] = icmp ult i8* [[TMP5]], [[UGLYGEP]]
+; CHECK-NEXT:    [[BOUND19:%.*]] = icmp ult i8* [[SCEVGEP45]], [[SCEVGEP1]]
+; CHECK-NEXT:    [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT10]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[CONFLICT_RDX]], true
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add nuw <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4, !alias.scope !0
+; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP12]], align 4, !alias.scope !3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP16]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = fdiv <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP17]], <4 x float>* [[TMP20]], align 4, !alias.scope !5, !noalias !7
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[KERNEL_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[R:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[I_0]]
+; CHECK-NEXT:    [[ARRAYIDX5_I:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[AA:%.*]] = add nuw i64 [[I_0]], 2
+; CHECK-NEXT:    [[C:%.*]] = load float, float* [[ARRAYIDX_I]], align 4
+; CHECK-NEXT:    [[D:%.*]] = load float, float* [[ARRAYIDX5_I]], align 4
+; CHECK-NEXT:    [[DIV_I:%.*]] = fdiv float [[C]], [[D]]
+; CHECK-NEXT:    [[ARRAYIDX9_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[I_0]]
+; CHECK-NEXT:    store float [[DIV_I]], float* [[ARRAYIDX9_I]], align 4
+; CHECK-NEXT:    [[R]] = add nuw i64 [[I_0]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[R]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[KERNEL_EXIT]], label [[FOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK:       kernel.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i.0 = phi i64 [ 0, %entry ], [ %r, %for.body ]
+  %arrayidx.i = getelementptr inbounds float, float* %0, i64 %i.0
+  %arrayidx5.i = getelementptr inbounds float, float* %1, i64 %3
+  %aa = add nuw i64 %i.0, 2
+  %c = load float, float* %arrayidx.i, align 4
+  %d = load float, float* %arrayidx5.i, align 4
+  %div.i = fdiv float %c, %d
+  %arrayidx9.i = getelementptr inbounds float, float* %2, i64 %i.0
+  store float %div.i, float* %arrayidx9.i, align 4
+  %r = add nuw i64 %i.0, 1
+  %exitcond.not = icmp eq i64 %r, %4
+  br i1 %exitcond.not, label %kernel.exit, label %for.body
+
+kernel.exit:
+  ret void
+}