diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -475,7 +475,8 @@ /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands, - VPTransformState &State); + VPTransformState &State, + Intrinsic::ID VectorIntrinsicID); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -4152,9 +4153,9 @@ return Cost->useOrderedReductions(RdxDesc); } -void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def, - VPUser &ArgOperands, - VPTransformState &State) { +void InnerLoopVectorizer::widenCallInstruction( + CallInst &CI, VPValue *Def, VPUser &ArgOperands, VPTransformState &State, + Intrinsic::ID VectorIntrinsicID) { assert(!isa(CI) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); State.setDebugLocFromInst(&CI); @@ -4165,19 +4166,6 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI); - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize = false; - InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = - ID ? 
Cost->getVectorIntrinsicCost(&CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); - assert((IntrinsicCost.isValid() || CallCost.isValid()) && - "Either the intrinsic cost or vector call cost must be valid"); - for (unsigned Part = 0; Part < UF; ++Part) { SmallVector TysForDecl = {CI.getType()}; SmallVector Args; @@ -4185,18 +4173,18 @@ // Some intrinsics have a scalar argument - don't replace it with a // vector. Value *Arg; - if (!UseVectorIntrinsic || - !isVectorIntrinsicWithScalarOpAtArg(ID, I.index())) + if (VectorIntrinsicID == Intrinsic::not_intrinsic || + !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index())) Arg = State.get(I.value(), Part); else Arg = State.get(I.value(), VPIteration(0, 0)); - if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index())) + if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index())) TysForDecl.push_back(Arg->getType()); Args.push_back(Arg); } Function *VectorF; - if (UseVectorIntrinsic) { + if (VectorIntrinsicID != Intrinsic::not_intrinsic) { // Use vector version of the intrinsic. if (VF.isVector()) TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF); @@ -8318,23 +8306,37 @@ ID == Intrinsic::experimental_noalias_scope_decl)) return nullptr; - auto willWiden = [&](ElementCount VF) -> bool { - // The following case may be scalarized depending on the VF. - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize = false; - InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - InstructionCost IntrinsicCost = ID ? 
CM.getVectorIntrinsicCost(CI, VF) : 0; - bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost; - return UseVectorIntrinsic || !NeedToScalarize; - }; - - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + auto CanUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) -> bool { + // The following case may be scalarized depending on the VF. + // The flag shows whether we can use a usual Call for vectorized + // version of the instruction. + bool NeedToScalarize = false; + CM.getVectorCallCost(CI, VF, NeedToScalarize); + return !NeedToScalarize; + }, + Range); + // Is it beneficial to perform intrinsic call compared to lib call? + bool CanUseVectorIntrinsic = + ID != Intrinsic::not_intrinsic && + LoopVectorizationPlanner::getDecisionAndClampRange( + [&](ElementCount VF) -> bool { + bool NeedToScalarize = false; + // Is it beneficial to perform intrinsic call compared to lib call? + InstructionCost CallCost = + CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); + return IntrinsicCost <= CallCost; + }, + Range); + + if (!CanUseVectorIntrinsic && !CanUseVectorCall) return nullptr; ArrayRef Ops = Operands.take_front(CI->arg_size()); - return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end())); + return new VPWidenCallRecipe( + *CI, make_range(Ops.begin(), Ops.end()), + CanUseVectorIntrinsic ? 
ID : Intrinsic::not_intrinsic); } bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { @@ -9311,7 +9313,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this, - *this, State); + *this, State, VectorIntrinsicID); } void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -946,12 +946,15 @@ /// A recipe for widening Call instructions. class VPWidenCallRecipe : public VPRecipeBase, public VPValue { + Intrinsic::ID VectorIntrinsicID; public: template <typename IterT> - VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments) + VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments, + Intrinsic::ID VectorIntrinsicID) : VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments), - VPValue(VPValue::VPVWidenCallSC, &I, this) {} + VPValue(VPValue::VPVWidenCallSC, &I, this), + VectorIntrinsicID(VectorIntrinsicID) {} ~VPWidenCallRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -450,6 +450,11 @@ O << "call @" << CI->getCalledFunction()->getName() << "("; printOperands(O, SlotTracker); O << ")"; + + if (VectorIntrinsicID == Intrinsic::not_intrinsic) + O << " (using library function)"; + else + O << " (using vector intrinsic)"; } void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -74,7 +74,7 @@ GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop); } else if 
(CallInst *CI = dyn_cast<CallInst>(Inst)) { NewRecipe = - new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args())); + new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()), Intrinsic::not_intrinsic); } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) { bool InvariantCond = SE.isLoopInvariant(SE.getSCEV(SI->getOperand(0)), OrigLoop); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -6,7 +6,7 @@ target triple = "arm64-apple-ios" ; CHECK-LABEL: LV: Checking a loop in 'test' -; CHECK: VPlan 'Initial VPlan for VF={2,4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<%1> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: @@ -19,7 +19,7 @@ ; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3> ; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src> ; CHECK-NEXT: WIDEN ir<%conv> = fpext ir<%l> -; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) +; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3> ; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> ; CHECK-NEXT: EMIT vp<%10> = VF * UF +(nuw) vp<%2> @@ -31,6 +31,33 @@ ; CHECK-NEXT: middle.block: ; CHECK-NEXT: No successors ; CHECK-NEXT: } + +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: vp<%3> = SCALAR-STEPS vp<%2>, ir<0>, ir<1> +; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3> +; CHECK-NEXT: 
WIDEN ir<%l> = load ir<%gep.src> +; CHECK-NEXT: WIDEN ir<%conv> = fpext ir<%l> +; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using vector intrinsic) +; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3> +; CHECK-NEXT: REPLICATE store ir<%s>, ir<%gep.dst> +; CHECK-NEXT: EMIT vp<%10> = VF * UF +(nuw) vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%10> vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; ; define void @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -26,7 +26,7 @@ ; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<0\>, ir\<1\>\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr ir\<%y\>, vp\<[[STEPS]]\>\l" + ; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" + -; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>)\l" + +; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>) (using vector intrinsic)\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr ir\<%x\>, vp\<[[STEPS]]\>\l" + ; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" + ; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT:%.+]]\> = VF * UF +(nuw) vp\<[[CAN_IV]]\>\l" + diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -806,7 +806,7 @@ SmallVector Args; Args.push_back(&Op1); Args.push_back(&Op2); - VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end())); + VPWidenCallRecipe Recipe(*Call, 
make_range(Args.begin(), Args.end()), Intrinsic::not_intrinsic); EXPECT_TRUE(isa<VPUser>(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa<VPUser>(BaseR)); @@ -1065,7 +1065,8 @@ SmallVector<VPValue *, 2> Args; Args.push_back(&Op1); Args.push_back(&Op2); - VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end())); + VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()), + Intrinsic::not_intrinsic); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory());