diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1169,6 +1169,19 @@ return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// Estimate cost of an intrinsic call instruction CI if it were vectorized with + /// factor VF. Return the cost of the instruction, including scalarization + /// overhead if it's needed. + unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF); + + /// Estimate cost of a call instruction CI if it were vectorized with factor VF. + /// Return the cost of the instruction, including scalarization overhead if it's + /// needed. The flag NeedToScalarize shows if the call needs to be scalarized - + // i.e. either vector version isn't available, or is too expensive. + unsigned getVectorCallCost(CallInst *CI, unsigned VF, + bool &NeedToScalarize); + + private: unsigned NumPredStores = 0; @@ -1221,6 +1234,10 @@ /// element) unsigned getUniformMemOpCost(Instruction *I, unsigned VF); + /// Estimate the overhead of scalarizing an instruction. This is a + /// convenience wrapper for the type-based getScalarizationOverhead API. + unsigned getScalarizationOverhead(Instruction *I, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted /// as a vector operation. bool isConsecutiveLoadOrStore(Instruction *I); @@ -3057,44 +3074,7 @@ } } -/// Estimate the overhead of scalarizing an instruction. This is a -/// convenience wrapper for the type-based getScalarizationOverhead API. -static unsigned getScalarizationOverhead(Instruction *I, unsigned VF, - const TargetTransformInfo &TTI) { - if (VF == 1) - return 0; - - unsigned Cost = 0; - Type *RetTy = ToVectorTy(I->getType(), VF); - if (!RetTy->isVoidTy() && - (!isa(I) || - !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead(RetTy, true, false); - - // Some targets keep addresses scalar. - if (isa(I) && !TTI.prefersVectorizedAddressing()) - return Cost; - - if (CallInst *CI = dyn_cast(I)) { - SmallVector Operands(CI->arg_operands()); - Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); - } - else if (!isa(I) || - !TTI.supportsEfficientVectorElementLoadStore()) { - SmallVector Operands(I->operand_values()); - Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); - } - - return Cost; -} - -// Estimate cost of a call instruction CI if it were vectorized with factor VF. -// Return the cost of the instruction, including scalarization overhead if it's -// needed. The flag NeedToScalarize shows if the call needs to be scalarized - -// i.e. either vector version isn't available, or is too expensive. -static unsigned getVectorCallCost(CallInst *CI, unsigned VF, - const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI, +unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize) { Function *F = CI->getCalledFunction(); StringRef FnName = CI->getCalledFunction()->getName(); @@ -3118,7 +3098,7 @@ // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - unsigned ScalarizationCost = getScalarizationOverhead(CI, VF, TTI); + unsigned ScalarizationCost = getScalarizationOverhead(CI, VF); unsigned Cost = ScalarCallCost * VF + ScalarizationCost; @@ -3137,12 +3117,7 @@ return Cost; } -// Estimate cost of an intrinsic call instruction CI if it were vectorized with -// factor VF. Return the cost of the instruction, including scalarization -// overhead if it's needed. -static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF, - const TargetTransformInfo &TTI, - const TargetLibraryInfo *TLI) { +unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, unsigned VF) { Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); @@ -4126,9 +4101,10 @@ // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + unsigned CallCost = + Cost->getVectorCallCost(CI, VF, NeedToScalarize); bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; assert((UseVectorIntrinsic || !NeedToScalarize) && "Instruction should be scalarized elsewhere."); @@ -5522,7 +5498,7 @@ // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF, TTI); + Cost += getScalarizationOverhead(I, VF); // If we have a predicated store, it may not be executed for each vector // lane. Scale the cost by the probability of executing the predicated @@ -5674,6 +5650,35 @@ return VectorizationCostTy(C, TypeNotScalarized); } +unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, unsigned VF) { + + if (VF == 1) + return 0; + + unsigned Cost = 0; + Type *RetTy = ToVectorTy(I->getType(), VF); + if (!RetTy->isVoidTy() && + (!isa(I) || + !TTI.supportsEfficientVectorElementLoadStore())) + Cost += TTI.getScalarizationOverhead(RetTy, true, false); + + // Some targets keep addresses scalar. + if (isa(I) && !TTI.prefersVectorizedAddressing()) + return Cost; + + if (CallInst *CI = dyn_cast(I)) { + SmallVector Operands(CI->arg_operands()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } + else if (!isa(I) || + !TTI.supportsEfficientVectorElementLoadStore()) { + SmallVector Operands(I->operand_values()); + Cost += TTI.getOperandsScalarizationOverhead(Operands, VF); + } + + return Cost; +} + void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { if (VF == 1) return; @@ -5914,7 +5919,7 @@ // The cost of insertelement and extractelement instructions needed for // scalarization. - Cost += getScalarizationOverhead(I, VF, TTI); + Cost += getScalarizationOverhead(I, VF); // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally @@ -6035,16 +6040,17 @@ case Instruction::Call: { bool NeedToScalarize; CallInst *CI = cast(I); - unsigned CallCost = getVectorCallCost(CI, VF, TTI, TLI, NeedToScalarize); + unsigned CallCost = + getVectorCallCost(CI, VF, NeedToScalarize); if (getVectorIntrinsicIDForCall(CI, TLI)) - return std::min(CallCost, getVectorIntrinsicCost(CI, VF, TTI, TLI)); + return std::min(CallCost, getVectorIntrinsicCost(CI, VF)); return CallCost; } default: // The cost of executing VF copies of the scalar instruction. This opcode // is unknown. Assume that it is the same as 'mul'. return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) + - getScalarizationOverhead(I, VF, TTI); + getScalarizationOverhead(I, VF); } // end of switch. } @@ -6638,9 +6644,10 @@ // version of the instruction. // Is it beneficial to perform intrinsic call compared to lib call? bool NeedToScalarize; - unsigned CallCost = getVectorCallCost(CI, VF, *TTI, TLI, NeedToScalarize); + unsigned CallCost = + CM.getVectorCallCost(CI, VF, NeedToScalarize); bool UseVectorIntrinsic = - ID && getVectorIntrinsicCost(CI, VF, *TTI, TLI) <= CallCost; + ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; return UseVectorIntrinsic || !NeedToScalarize; } if (isa(I) || isa(I)) { @@ -6828,7 +6835,7 @@ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); auto Plan = llvm::make_unique(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, TTI, Legal, CM, Builder); + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -29,9 +29,6 @@ /// Target Library Info. const TargetLibraryInfo *TLI; - /// Target Transform Info. - const TargetTransformInfo *TTI; - /// The legality analysis. LoopVectorizationLegality *Legal; @@ -104,10 +101,9 @@ public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, - const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, VPBuilder &Builder) - : OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {} /// Check if a recipe can be create for \p I withing the given VF \p Range.