diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1261,9 +1261,11 @@ const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. - void selectUserVectorizationFactor(ElementCount UserVF) { + /// \return true if the UserVF is a feasible VF to be chosen. + bool selectUserVectorizationFactor(ElementCount UserVF) { collectUniformsAndScalars(UserVF); collectInstsToScalarize(UserVF); + return expectedCost(UserVF).first.isValid(); } /// \return The size (in bits) of the smallest and widest types in the code @@ -5725,8 +5727,14 @@ auto MaxSafeUserVF = UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; - if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) - return UserVF; + if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { + // If `VF=vscale x N` is safe, then so is `VF=N` + if (UserVF.isScalable()) + return FixedScalableVFPair( + ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); + else + return UserVF; + } assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); @@ -6072,17 +6080,11 @@ if (i.isScalar()) continue; - // Notice that the vector loop needs to be executed less times, so - // we need to divide the cost of the vector loops by the width of - // the vector elements. VectorizationCostTy C = expectedCost(i); - - assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); VectorizationFactor Candidate(i, C.first); LLVM_DEBUG( dbgs() << "LV: Vector loop of width " << i << " costs: " - << (*Candidate.Cost.getValue() / - Candidate.Width.getKnownMinValue()) + << (Candidate.Cost / Candidate.Width.getKnownMinValue()) << (i.isScalable() ? 
" (assuming a minimum vscale of 1)" : "") << ".\n"); @@ -6109,8 +6111,7 @@ } LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) - dbgs() + ChosenFactor.Cost >= ScalarCost.Cost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); @@ -6438,8 +6439,9 @@ // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { - assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); - LoopCost = *expectedCost(VF).first.getValue(); + InstructionCost C = expectedCost(VF).first; + assert(C.isValid() && "Expected to have chosen a VF with valid cost"); + LoopCost = *C.getValue(); } assert(LoopCost && "Non-zero loop cost expected"); @@ -7295,6 +7297,8 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, ElementCount VF) const { + // There is no mechanism yet to create a scalable scalarization loop, + // so this is currently Invalid. if (VF.isScalable()) return InstructionCost::getInvalid(); @@ -8013,17 +8017,19 @@ UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); if (!UserVF.isZero() && UserVFIsLegal) { - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << UserVF << ".\n"); assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
- CM.selectUserVectorizationFactor(UserVF); - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); - LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + if (CM.selectUserVectorizationFactor(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); + CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(UserVF, UserVF); + LLVM_DEBUG(printPlans(dbgs())); + return {{UserVF, 0}}; + } else + reportVectorizationInfo("UserVF ignored because of invalid costs.", + "InvalidCost", ORE, OrigLoop); } // Populate the set of Vectorization Factor Candidates. @@ -8798,8 +8804,6 @@ InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - assert((IntrinsicCost.isValid() || CallCost.isValid()) && - "Either the intrinsic cost or vector call cost must be valid"); return UseVectorIntrinsic || !NeedToScalarize; }; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll @@ -75,7 +75,7 @@ ; CHECK-LABEL: @vec_intrinsic ; CHECK: vector.body: ; CHECK: %[[LOAD:.*]] = load , * -; CHECK: call fast @sin_vec( %[[LOAD]]) +; CHECK: call fast @sin_vec_nxv2f64( %[[LOAD]]) entry: %cmp7 = icmp sgt i64 %N, 0 br i1 %cmp7, label %for.body, label %for.end @@ -95,17 +95,90 @@ ret void } +define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) { +; CHECK: @vec_sin_no_mapping +; CHECK: call fast <2 x float> @llvm.sin.v2f32 +; CHECK-NOT: @llvm.sin.v2f32 +; CHECK-NOT: @llvm.sin.nxv2f32 +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, 
float* %src, i64 %i.07 + %0 = load float, float* %arrayidx, align 4 + %1 = tail call fast float @llvm.sin.f32(float %0) + %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07 + store float %1, float* %arrayidx1, align 4 + %inc = add nuw nsw i64 %i.07, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1 + +for.cond.cleanup: ; preds = %for.body + ret void +} + + declare double @foo(double) declare i64 @bar(i64*) declare double @llvm.sin.f64(double) +declare float @llvm.sin.f32(float) +declare float @llvm.sqrt.f32(float) declare @foo_vec() declare @bar_vec() -declare @sin_vec() +declare @sin_vec_nxv2f64() +declare <2 x double> @sin_vec_v2f64(<2 x double>) attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" } attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" } -attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec)" } +attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec_nxv2f64)" } +attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" } !1 = distinct !{!1, !2, !3} !2 = !{!"llvm.loop.vectorize.width", i32 2}