diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4440,9 +4440,9 @@
                                               VPValue *StartVPV, VPValue *Def,
                                               VPTransformState &State) {
   PHINode *P = cast<PHINode>(PN);
-  if (EnableVPlanNativePath) {
-    // Currently we enter here in the VPlan-native path for non-induction
-    // PHIs where all control flow is uniform. We simply widen these PHIs.
+  if (EnableVPlanNativePath && !OrigLoop->isInnermost()) {
+    // We enter here in the VPlan-native path and when the loop is not the
+    // innermost loop. We handle non-induction PHIs here and simply widen them.
     // Create a vector phi with no operands - the vector phi operands will be
     // set at the end of vector code generation.
     Type *VecTy = (State.VF.isScalar())
@@ -5181,7 +5181,8 @@
     // A uniform memory op is itself uniform. We exclude uniform stores
     // here as they demand the last lane, not the first one.
     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
-      assert(WideningDecision == CM_Scalarize);
+      assert(WideningDecision == CM_Scalarize ||
+             WideningDecision == CM_GatherScatter);
       return true;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -599,7 +599,8 @@
   assert((EnableVPlanNativePath ||
           isa<UnreachableInst>(LastBB->getTerminator())) &&
          "Expected InnerLoop VPlan CFG to terminate with unreachable");
-  assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
+  assert((!EnableVPlanNativePath ||
+          (L->isInnermost() || isa<BranchInst>(LastBB->getTerminator()))) &&
          "Expected VPlan CFG to terminate with branch in NativePath");
   LastBB->getTerminator()->eraseFromParent();
   BranchInst::Create(VectorLatchBB, LastBB);
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-single-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-single-loop.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-single-loop.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
+
+; Test that when the VPlan native path is enabled and no loop is explicitly
+; marked to be vectorized, a single innermost loop is vectorized without issues.
+; See PR42592 (https://bugs.llvm.org/show_bug.cgi?id=42592).
+
+target triple = "x86_64-unknown-linux-gnu"
+define void @kernel(float* nocapture readonly %0, float* nocapture readonly %1, float* nocapture %2, i64 %3, i64 %4) {
+; CHECK-LABEL: @kernel(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 [[TMP3:%.*]]
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float*> poison, float* [[TMP6]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float*> [[DOTSPLATINSERT]], <4 x float*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP5]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[DOTSPLAT]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+; CHECK-NEXT:    [[TMP8:%.*]] = fdiv <4 x float> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP2:%.*]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> [[TMP8]], <4 x float*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[KERNEL_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[R:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[I_0]]
+; CHECK-NEXT:    [[ARRAYIDX5_I:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[AA:%.*]] = add nuw i64 [[I_0]], 2
+; CHECK-NEXT:    [[C:%.*]] = load float, float* [[ARRAYIDX_I]], align 4
+; CHECK-NEXT:    [[D:%.*]] = load float, float* [[ARRAYIDX5_I]], align 4
+; CHECK-NEXT:    [[DIV_I:%.*]] = fdiv float [[C]], [[D]]
+; CHECK-NEXT:    [[ARRAYIDX9_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[I_0]]
+; CHECK-NEXT:    store float [[DIV_I]], float* [[ARRAYIDX9_I]], align 4
+; CHECK-NEXT:    [[R]] = add nuw i64 [[I_0]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[R]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[KERNEL_EXIT]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; CHECK:       kernel.exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i.0 = phi i64 [ 0, %entry ], [ %r, %for.body ]
+  %arrayidx.i = getelementptr inbounds float, float* %0, i64 %i.0
+  %arrayidx5.i = getelementptr inbounds float, float* %1, i64 %3
+  %aa = add nuw i64 %i.0, 2
+  %c = load float, float* %arrayidx.i, align 4
+  %d = load float, float* %arrayidx5.i, align 4
+  %div.i = fdiv float %c, %d
+  %arrayidx9.i = getelementptr inbounds float, float* %2, i64 %i.0
+  store float %div.i, float* %arrayidx9.i, align 4
+  %r = add nuw i64 %i.0, 1
+  %exitcond.not = icmp eq i64 %r, %4
+  br i1 %exitcond.not, label %kernel.exit, label %for.body
+
+kernel.exit:
+  ret void
+}
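
For reference, the @kernel loop in the new test corresponds roughly to the following C function (a hypothetical reconstruction from the IR above; the names a, b, c, k, and n are invented). The loop-invariant load of b[k] is the uniform memory op that, per the CHECK lines, the VPlan-native path currently widens as a gather of a splatted pointer, which appears to be why the CM_Scalarize assertion in LoopVectorize.cpp is relaxed to also accept CM_GatherScatter.

/* Hypothetical C source for the tested IR kernel; for illustration only. */
void kernel(const float *a, const float *b, float *c, long k, long n) {
  for (long i = 0; i < n; ++i) {
    long aa = i + 2;      /* mirrors the dead "%aa = add nuw i64 %i.0, 2" */
    (void)aa;
    c[i] = a[i] / b[k];   /* b[k] is loop-invariant: a uniform load */
  }
}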