diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1636,6 +1636,11 @@ Scalars.clear(); } + /// \return true if any of the instructions has an invalid cost. Any + /// such instructions are captured in \p Invalid. + bool hasInvalidCosts(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); + private: unsigned NumPredStores = 0; @@ -1675,7 +1680,9 @@ /// not matter because we use the 'cost' units to compare different /// vector widths. The cost that is returned is *not* normalized by /// the factor width. - VectorizationCostTy expectedCost(ElementCount VF); + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. @@ -5694,8 +5701,14 @@ auto MaxSafeUserVF = UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF; - if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) - return UserVF; + if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) { + // If `VF=vscale x N` is safe, then so is `VF=N` + if (UserVF.isScalable()) + return FixedScalableVFPair( + ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF); + else + return UserVF; + } assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF)); @@ -6041,17 +6054,25 @@ if (i.isScalar()) continue; - // Notice that the vector loop needs to be executed less times, so - // we need to divide the cost of the vector loops by the width of - // the vector elements. - VectorizationCostTy C = expectedCost(i); + SmallVector InvalidCosts; + VectorizationCostTy C = expectedCost(i, &InvalidCosts); + if (!C.first.isValid()) { + // Print an opt-report explaining why the VF is not considered. 
+ std::string Message; + raw_string_ostream OS(Message); + OS << "Not vectorizing with VF=" << i + << " because of instructions with invalid cost:\n"; + for (const auto *I : InvalidCosts) + OS << "\t" << *I << "\n"; + OS.flush(); + reportVectorizationInfo(Message, "InvalidCost", ORE, TheLoop, + InvalidCosts[0]); + } - assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); VectorizationFactor Candidate(i, C.first); LLVM_DEBUG( dbgs() << "LV: Vector loop of width " << i << " costs: " - << (*Candidate.Cost.getValue() / - Candidate.Width.getKnownMinValue()) + << (Candidate.Cost / Candidate.Width.getKnownMinValue()) << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") << ".\n"); @@ -6078,8 +6099,7 @@ } LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue()) - dbgs() + ChosenFactor.Cost >= ScalarCost.Cost) dbgs() << "LV: Vectorization seems to be not beneficial, " << "but was forced by a user.\n"); LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); @@ -6407,8 +6427,9 @@ // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. 
if (LoopCost == 0) { - assert(expectedCost(VF).first.isValid() && "Expected a valid cost"); - LoopCost = *expectedCost(VF).first.getValue(); + InstructionCost C = expectedCost(VF).first; + assert(C.isValid() && "Expected to have chosen a VF with valid cost"); + LoopCost = *C.getValue(); } assert(LoopCost && "Non-zero loop cost expected"); @@ -6850,8 +6871,14 @@ return *Discount.getValue(); } +bool LoopVectorizationCostModel::hasInvalidCosts( + ElementCount VF, SmallVectorImpl *Invalid) { + return !expectedCost(VF, Invalid).first.isValid(); +} + LoopVectorizationCostModel::VectorizationCostTy -LoopVectorizationCostModel::expectedCost(ElementCount VF) { +LoopVectorizationCostModel::expectedCost( + ElementCount VF, SmallVectorImpl *Invalid) { VectorizationCostTy Cost; // For each block. @@ -6871,6 +6898,10 @@ if (ForceTargetInstructionCost.getNumOccurrences() > 0) C.first = InstructionCost(ForceTargetInstructionCost); + // Keep a list of instructions with invalid costs. + if (Invalid && !C.first.isValid()) + Invalid->push_back(&I); + BlockCost.first += C.first; BlockCost.second |= C.second; LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first @@ -7264,6 +7295,8 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, ElementCount VF) const { + // There is no mechanism yet to create a scalable scalarization loop, + // so this is currently Invalid. if (VF.isScalable()) return InstructionCost::getInvalid(); @@ -7982,17 +8015,21 @@ UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF; bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF); if (!UserVF.isZero() && UserVFIsLegal) { - LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max") - << " VF " << UserVF << ".\n"); assert(isPowerOf2_32(UserVF.getKnownMinValue()) && "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. 
CM.selectUserVectorizationFactor(UserVF); - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); - LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0}}; + if (!CM.hasInvalidCosts(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: Using " + << "user VF " << UserVF << ".\n"); + CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(UserVF, UserVF); + LLVM_DEBUG(printPlans(dbgs())); + return {{UserVF, 0}}; + } else + reportVectorizationInfo("UserVF ignored because of invalid costs.", + "InvalidCost", ORE, OrigLoop); } // Populate the set of Vectorization Factor Candidates. @@ -8758,8 +8795,6 @@ InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0; bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - assert((IntrinsicCost.isValid() || CallCost.isValid()) && - "Either the intrinsic cost or vector call cost must be valid"); return UseVectorIntrinsic || !NeedToScalarize; }; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll @@ -1,4 +1,6 @@ -; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on < %s | FileCheck %s +; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu -scalable-vectorization=on \ +; RUN: -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) { ; CHECK-LABEL: @vec_load @@ -75,7 +77,7 @@ ; CHECK-LABEL: @vec_intrinsic ; CHECK: vector.body: ; CHECK: %[[LOAD:.*]] = load , * -; CHECK: call fast @sin_vec( %[[LOAD]]) +; CHECK: call fast 
@sin_vec_nxv2f64( %[[LOAD]]) entry: %cmp7 = icmp sgt i64 %N, 0 br i1 %cmp7, label %for.body, label %for.end @@ -95,17 +97,100 @@ ret void } +; CHECK-REMARKS: UserVF ignored because of invalid costs. +; CHECK-REMARKS-NEXT: Not vectorizing with VF=vscale x 1 because of instructions with invalid cost: +; CHECK-REMARKS-NEXT: %1 = tail call fast float @llvm.sin.f32(float %0) +; CHECK-REMARKS: Not vectorizing with VF=vscale x 2 because of instructions with invalid cost: +; CHECK-REMARKS-NEXT: %1 = tail call fast float @llvm.sin.f32(float %0) +define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) { +; CHECK: @vec_sin_no_mapping +; CHECK: call fast <2 x float> @llvm.sin.v2f32 +; CHECK-NOT: @llvm.sin.v2f32 +; CHECK-NOT: @llvm.sqrt.nxv2f32 +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07 + %0 = load float, float* %arrayidx, align 4 + %1 = tail call fast float @llvm.sin.f32(float %0) + %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07 + store float %1, float* %arrayidx1, align 4 + %inc = add nuw nsw i64 %i.07, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1 + +for.cond.cleanup: ; preds = %for.body + ret void +} + + declare double @foo(double) declare i64 @bar(i64*) declare double @llvm.sin.f64(double) +declare float @llvm.sin.f32(float) +declare float @llvm.sqrt.f32(float) declare @foo_vec() declare @bar_vec() -declare @sin_vec() +declare @sin_vec_nxv2f64() +declare <2 x double> @sin_vec_v2f64(<2 x double>) attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" } attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" } -attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec)" } +attributes #2 = { 
"vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec_nxv2f64)" } +attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" } !1 = distinct !{!1, !2, !3} !2 = !{!"llvm.loop.vectorize.width", i32 2}