diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -475,7 +475,8 @@
   /// Widen a single call instruction within the innermost loop.
   void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
-                            VPTransformState &State);
+                            VPTransformState &State,
+                            Intrinsic::ID VectorIntrinsicID);
 
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
@@ -4152,9 +4153,9 @@
   return Cost->useOrderedReductions(RdxDesc);
 }
 
-void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
-                                               VPUser &ArgOperands,
-                                               VPTransformState &State) {
+void InnerLoopVectorizer::widenCallInstruction(
+    CallInst &CI, VPValue *Def, VPUser &ArgOperands, VPTransformState &State,
+    Intrinsic::ID VectorIntrinsicID) {
   assert(!isa<DbgInfoIntrinsic>(CI) &&
          "DbgInfoIntrinsic should have been dropped during VPlan construction");
   State.setDebugLocFromInst(&CI);
@@ -4163,21 +4164,6 @@
   for (Value *ArgOperand : CI.args())
     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
 
-  Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
-
-  // The flag shows whether we use Intrinsic or a usual Call for vectorized
-  // version of the instruction.
-  // Is it beneficial to perform intrinsic call compared to lib call?
-  bool NeedToScalarize = false;
-  InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
-  InstructionCost IntrinsicCost =
-      ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
-  bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
-  assert((UseVectorIntrinsic || !NeedToScalarize) &&
-         "Instruction should be scalarized elsewhere.");
-  assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
-         "Either the intrinsic cost or vector call cost must be valid");
-
   for (unsigned Part = 0; Part < UF; ++Part) {
     SmallVector<Type *, 2> TysForDecl = {CI.getType()};
     SmallVector<Value *, 4> Args;
@@ -4185,23 +4171,23 @@
       // Some intrinsics have a scalar argument - don't replace it with a
       // vector.
       Value *Arg;
-      if (!UseVectorIntrinsic ||
-          !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
+      if (!VectorIntrinsicID ||
+          !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
         Arg = State.get(I.value(), Part);
       else
         Arg = State.get(I.value(), VPIteration(0, 0));
-      if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
+      if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
         TysForDecl.push_back(Arg->getType());
       Args.push_back(Arg);
     }
 
     Function *VectorF;
-    if (UseVectorIntrinsic) {
+    if (VectorIntrinsicID) {
       // Use vector version of the intrinsic.
       if (VF.isVector())
         TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
       Module *M = State.Builder.GetInsertBlock()->getModule();
-      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
       assert(VectorF && "Can't retrieve vector intrinsic.");
     } else {
       // Use vector version of the function call.
@@ -8320,23 +8306,42 @@
        ID == Intrinsic::experimental_noalias_scope_decl))
     return nullptr;
 
-  auto willWiden = [&](ElementCount VF) -> bool {
-    // The following case may be scalarized depending on the VF.
-    // The flag shows whether we use Intrinsic or a usual Call for vectorized
-    // version of the instruction.
-    // Is it beneficial to perform intrinsic call compared to lib call?
-    bool NeedToScalarize = false;
-    InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
-    InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
-    bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
-    return UseVectorIntrinsic || !NeedToScalarize;
-  };
+  ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
 
-  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
-    return nullptr;
+  // Is it beneficial to perform the intrinsic call compared to the lib call?
+  bool ShouldUseVectorIntrinsic =
+      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
+                [&](ElementCount VF) -> bool {
+                  bool NeedToScalarize = false;
+                  // Compare the cost of the vector intrinsic against the
+                  // cost of the best available vector library call.
+                  InstructionCost CallCost =
+                      CM.getVectorCallCost(CI, VF, NeedToScalarize);
+                  InstructionCost IntrinsicCost =
+                      CM.getVectorIntrinsicCost(CI, VF);
+                  return IntrinsicCost <= CallCost;
+                },
+                Range);
+  if (ShouldUseVectorIntrinsic)
+    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
+
+  // Is it better to call a vectorized version of the function than to
+  // scalarize the call?
+  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](ElementCount VF) -> bool {
+        // The following case may be scalarized depending on the VF.
+        // The flag shows whether we can use a vector library call for the
+        // vectorized version of the instruction.
+        bool NeedToScalarize = false;
+        CM.getVectorCallCost(CI, VF, NeedToScalarize);
+        return !NeedToScalarize;
+      },
+      Range);
+  if (ShouldUseVectorCall)
+    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
+                                 Intrinsic::not_intrinsic);
 
-  ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
-  return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
+  return nullptr;
 }
 
 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
@@ -9160,7 +9165,7 @@
   VPlanTransforms::VPInstructionsToVPRecipes(
       OrigLoop, Plan,
      [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
-      DeadInstructions, *PSE.getSE());
+      DeadInstructions, *PSE.getSE(), *TLI);
 
   // Remove the existing terminator of the exiting block of the top-most region.
   // A BranchOnCount will be added instead when adding the canonical IV recipes.
@@ -9313,7 +9318,7 @@
 void VPWidenCallRecipe::execute(VPTransformState &State) {
   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
-                                  *this, State);
+                                  *this, State, VectorIntrinsicID);
 }
 
 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
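Note: both decisions above funnel through LoopVectorizationPlanner::getDecisionAndClampRange. For context, here is a simplified sketch of that existing helper (paraphrased from LoopVectorize.cpp, not part of this patch): it evaluates the predicate for each VF in the range, clamps Range.End at the first VF where the answer flips, and returns the decision made for Range.Start.

```cpp
// Simplified sketch (assumed shape of the existing planner helper).
static bool getDecisionAndClampRange(
    const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
  assert(!Range.isEmpty() && "Trying to test an empty VF range.");
  bool PredicateAtRangeStart = Predicate(Range.Start);

  // Clamp the range at the first VF whose decision disagrees with the start.
  for (ElementCount TmpVF = Range.Start * 2;
       ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
    if (Predicate(TmpVF) != PredicateAtRangeStart) {
      Range.End = TmpVF;
      break;
    }

  return PredicateAtRangeStart;
}
```

Because the intrinsic-vs-libcall answer may flip between VFs, a single candidate VF range can now be split into multiple plans; the AArch64 test update below shows exactly that.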
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -64,6 +64,10 @@
 class VPReplicateRecipe;
 class VPlanSlp;
 
+namespace Intrinsic {
+typedef unsigned ID;
+}
+
 /// Returns a calculation for the total number of elements for a given \p VF.
 /// For fixed width vectors this value is a constant, whereas for scalable
 /// vectors it is an expression determined at runtime.
@@ -946,12 +950,17 @@
 /// A recipe for widening Call instructions.
 class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
+  /// ID of the vector intrinsic to call when widening the call. If set to
+  /// Intrinsic::not_intrinsic, a library call will be used instead.
+  Intrinsic::ID VectorIntrinsicID;
 public:
   template <typename IterT>
-  VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+  VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
+                    Intrinsic::ID VectorIntrinsicID)
       : VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),
-        VPValue(VPValue::VPVWidenCallSC, &I, this) {}
+        VPValue(VPValue::VPVWidenCallSC, &I, this),
+        VectorIntrinsicID(VectorIntrinsicID) {}
 
   ~VPWidenCallRecipe() override = default;
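Note: the `namespace Intrinsic { typedef unsigned ID; }` forward declaration avoids pulling llvm/IR/Intrinsics.h into VPlan.h. A hypothetical usage sketch of the new constructor (`CI` and `Ops` are assumed to be in scope; this snippet is illustrative, not taken from the patch):

```cpp
// Widen by calling the vector form of an intrinsic directly ...
auto *AsIntrinsic = new VPWidenCallRecipe(
    *CI, make_range(Ops.begin(), Ops.end()), Intrinsic::sin);

// ... or widen by calling a vector library function chosen via TLI. The
// sentinel Intrinsic::not_intrinsic selects the library-call path.
auto *AsLibCall = new VPWidenCallRecipe(
    *CI, make_range(Ops.begin(), Ops.end()), Intrinsic::not_intrinsic);
```

Either way, the decision is baked into the recipe at VPlan-construction time instead of being re-derived when the recipe executes.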
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -450,6 +450,11 @@
   O << "call @" << CI->getCalledFunction()->getName() << "(";
   printOperands(O, SlotTracker);
   O << ")";
+
+  if (VectorIntrinsicID)
+    O << " (using vector intrinsic)";
+  else
+    O << " (using library function)";
 }
 
 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -23,6 +23,7 @@
 class PHINode;
 class ScalarEvolution;
 class Loop;
+class TargetLibraryInfo;
 
 struct VPlanTransforms {
   /// Replaces the VPInstructions in \p Plan with corresponding
@@ -32,7 +33,7 @@
       function_ref<const InductionDescriptor *(PHINode *)>
          GetIntOrFpInductionDescriptor,
       SmallPtrSetImpl<Instruction *> &DeadInstructions,
-      ScalarEvolution &SE);
+      ScalarEvolution &SE, const TargetLibraryInfo &TLI);
 
   static bool sinkScalarOperands(VPlan &Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -15,6 +15,8 @@
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Intrinsics.h"
 
 using namespace llvm;
 
@@ -22,7 +24,8 @@
     Loop *OrigLoop, VPlanPtr &Plan,
     function_ref<const InductionDescriptor *(PHINode *)>
        GetIntOrFpInductionDescriptor,
-    SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) {
+    SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE,
+    const TargetLibraryInfo &TLI) {
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       Plan->getEntry());
@@ -74,7 +77,8 @@
           GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
     } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
       NewRecipe =
-          new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()));
+          new VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->args()),
+                                getVectorIntrinsicIDForCall(CI, &TLI));
     } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
       bool InvariantCond =
           SE.isLoopInvariant(SE.getSCEV(SI->getOperand(0)), OrigLoop);
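Note: the seeding above uses getVectorIntrinsicIDForCall from llvm/Analysis/VectorUtils.h, which returns the vectorizable intrinsic ID for a call (including calls to libm functions that the TargetLibraryInfo recognizes, e.g. sinf) or Intrinsic::not_intrinsic otherwise. A minimal illustration of the behavior being relied on; describeWidening is a hypothetical helper for this note, not part of the patch:

```cpp
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper: report which widening strategy a recipe seeded from
// this call would print, mirroring the new VPWidenCallRecipe::print suffix.
static const char *describeWidening(const CallInst *CI,
                                    const TargetLibraryInfo *TLI) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  return ID != Intrinsic::not_intrinsic ? "using vector intrinsic"
                                        : "using library function";
}
```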
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll
@@ -6,7 +6,7 @@
 target triple = "arm64-apple-ios"
 
 ; CHECK-LABEL: LV: Checking a loop in 'test'
-; CHECK: VPlan 'Initial VPlan for VF={2,4},UF>=1' {
+; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
 ; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
 ; CHECK-EMPTY:
 ; CHECK-NEXT: vector.ph:
@@ -19,7 +19,7 @@
 ; CHECK-NEXT:     CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3>
 ; CHECK-NEXT:     WIDEN ir<%l> = load ir<%gep.src>
 ; CHECK-NEXT:     WIDEN ir<%conv> = fpext ir<%l>
-; CHECK-NEXT:     WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>)
+; CHECK-NEXT:     WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function)
 ; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3>
 ; CHECK-NEXT:     REPLICATE store ir<%s>, ir<%gep.dst>
 ; CHECK-NEXT:     EMIT vp<%10> = VF * UF +(nuw) vp<%2>
@@ -31,6 +31,33 @@
 ; CHECK-NEXT: middle.block:
 ; CHECK-NEXT: No successors
 ; CHECK-NEXT: }
+
+; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT: Live-in vp<%1> = vector-trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<%2> = CANONICAL-INDUCTION
+; CHECK-NEXT:     vp<%3> = SCALAR-STEPS vp<%2>, ir<0>, ir<1>
+; CHECK-NEXT:     CLONE ir<%gep.src> = getelementptr ir<%src>, vp<%3>
+; CHECK-NEXT:     WIDEN ir<%l> = load ir<%gep.src>
+; CHECK-NEXT:     WIDEN ir<%conv> = fpext ir<%l>
+; CHECK-NEXT:     WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using vector intrinsic)
+; CHECK-NEXT:     REPLICATE ir<%gep.dst> = getelementptr ir<%dst>, vp<%3>
+; CHECK-NEXT:     REPLICATE store ir<%s>, ir<%gep.dst>
+; CHECK-NEXT:     EMIT vp<%10> = VF * UF +(nuw) vp<%2>
+; CHECK-NEXT:     EMIT branch-on-count vp<%10> vp<%1>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
 
 define void @test(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-LABEL: @test(
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
--- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll
@@ -26,7 +26,7 @@
 ; CHECK-NEXT: "  vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<0\>, ir\<1\>\l" +
 ; CHECK-NEXT: "  CLONE ir\<%arrayidx\> = getelementptr ir\<%y\>, vp\<[[STEPS]]\>\l" +
 ; CHECK-NEXT: "  WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" +
-; CHECK-NEXT: "  WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>)\l" +
+; CHECK-NEXT: "  WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>) (using vector intrinsic)\l" +
 ; CHECK-NEXT: "  CLONE ir\<%arrayidx2\> = getelementptr ir\<%x\>, vp\<[[STEPS]]\>\l" +
 ; CHECK-NEXT: "  WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" +
 ; CHECK-NEXT: "  EMIT vp\<[[CAN_IV_NEXT:%.+]]\> = VF * UF +(nuw) vp\<[[CAN_IV]]\>\l" +
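Note on the test change above: the single VF={2,4} plan becoming separate VF={2} and VF={4} plans falls out of per-subrange plan construction. On this target the cost model prefers the vector library function for llvm.sin.f64 at VF=2 but the vector intrinsic at VF=4, so the decision lambda flips mid-range and getDecisionAndClampRange clamps it. A simplified sketch of the driving loop (assumed shape, abbreviated from buildVPlansWithVPRecipes):

```cpp
// Each iteration builds one VPlan for a sub-range of VFs. Recipe-construction
// decisions (such as intrinsic vs. library call) may clamp SubRange.End, in
// which case the next iteration starts a fresh plan at the clamped VF.
auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
  VFRange SubRange = {VF, MaxVFPlusOne};
  VPlans.push_back(buildVPlanWithVPRecipes(SubRange, DeadInstructions));
  VF = SubRange.End;
}
```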
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -9,6 +9,8 @@
 #include "../lib/Transforms/Vectorize/VPlan.h"
 #include "../lib/Transforms/Vectorize/VPlanTransforms.h"
 #include "VPlanTestBase.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "gtest/gtest.h"
 #include <string>
 
@@ -133,11 +135,12 @@
 )";
   EXPECT_EQ(ExpectedStr, FullDump);
 #endif
-
+  TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
+  TargetLibraryInfo TLI(TLII);
   SmallPtrSet<Instruction *, 1> DeadInstructions;
   VPlanTransforms::VPInstructionsToVPRecipes(
       LI->getLoopFor(LoopHeader), Plan, [](PHINode *P) { return nullptr; },
-      DeadInstructions, *SE);
+      DeadInstructions, *SE, TLI);
 }
 
 TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
@@ -165,9 +168,11 @@
   auto Plan = buildHCFG(LoopHeader);
 
   SmallPtrSet<Instruction *, 1> DeadInstructions;
+  TargetLibraryInfoImpl TLII(Triple(M.getTargetTriple()));
+  TargetLibraryInfo TLI(TLII);
   VPlanTransforms::VPInstructionsToVPRecipes(
       LI->getLoopFor(LoopHeader), Plan, [](PHINode *P) { return nullptr; },
-      DeadInstructions, *SE);
+      DeadInstructions, *SE, TLI);
 
   VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
   EXPECT_NE(nullptr, Entry->getSingleSuccessor());
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -806,7 +806,7 @@
   SmallVector<VPValue *, 2> Args;
   Args.push_back(&Op1);
   Args.push_back(&Op2);
-  VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()));
+  VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()), false);
   EXPECT_TRUE(isa<VPUser>(&Recipe));
   VPRecipeBase *BaseR = &Recipe;
   EXPECT_TRUE(isa<VPUser>(BaseR));
@@ -1065,7 +1065,8 @@
   SmallVector<VPValue *, 2> Args;
   Args.push_back(&Op1);
   Args.push_back(&Op2);
-  VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()));
+  VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()),
+                           false);
   EXPECT_TRUE(Recipe.mayHaveSideEffects());
   EXPECT_TRUE(Recipe.mayReadFromMemory());
   EXPECT_TRUE(Recipe.mayWriteToMemory());
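Note on the unit-test changes: Intrinsic::ID is the typedef unsigned introduced in VPlan.h, and Intrinsic::not_intrinsic has value 0, so passing `false` here implicitly converts to not_intrinsic. A more self-documenting spelling (a suggestion, equivalent to what the patch does):

```cpp
// Same behavior as passing `false`, but spells out that no vector
// intrinsic is used and the recipe takes the library-call path.
VPWidenCallRecipe Recipe(*Call, make_range(Args.begin(), Args.end()),
                         Intrinsic::not_intrinsic);
```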