Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1247,6 +1247,8 @@
   /// avoid redundant calculations.
   void setCostBasedWideningDecision(ElementCount VF);
 
+  void setVectorizedCallDecision(ElementCount VF);
+
   /// A struct that represents some properties of the register usage
   /// of a loop.
   struct RegisterUsage {
@@ -1358,7 +1360,9 @@
     CM_Widen_Reverse, // For consecutive accesses with stride -1.
     CM_Interleave,
     CM_GatherScatter,
-    CM_Scalarize
+    CM_Scalarize,
+    CM_VectorCall,
+    CM_IntrinsicCall
   };
 
   /// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1414,6 +1418,29 @@
     return WideningDecisions[InstOnVF].second;
   }
 
+  struct CallWideningDecision {
+    InstWidening Kind;
+    Function *Variant;
+    Intrinsic::ID IID;
+    std::optional<unsigned> MaskPos;
+    InstructionCost Cost;
+  };
+
+  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
+                               Function *Variant, Intrinsic::ID IID,
+                               std::optional<unsigned> MaskPos,
+                               InstructionCost Cost) {
+    assert(VF.isVector() && "Expected VF >= 2");
+    CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
+                                                     MaskPos, Cost};
+  }
+
+  CallWideningDecision getCallWideningDecision(CallInst *CI,
+                                               ElementCount VF) const {
+    assert(VF.isVector() && "Expected VF >= 2");
+    return CallWideningDecisions.at(std::make_pair(CI, VF));
+  }
+
   /// Return True if instruction \p I is an optimizable truncate whose operand
   /// is an induction variable. Such a truncate will be removed by adding a new
   /// induction variable with the destination type.
@@ -1452,6 +1479,7 @@
     if (VF.isScalar() || Uniforms.contains(VF))
       return;
     setCostBasedWideningDecision(VF);
+    setVectorizedCallDecision(VF);
     collectLoopUniforms(VF);
     collectLoopScalars(VF);
   }
@@ -1629,16 +1657,13 @@
 
   /// Estimate cost of a call instruction CI if it were vectorized with factor
   /// VF. Return the cost of the instruction, including scalarization overhead
-  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
-  /// scalarized -
-  /// i.e. either vector version isn't available, or is too expensive.
-  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
-                                    Function **Variant,
-                                    bool *NeedsMask = nullptr) const;
+  /// if it's needed.
+  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
 
   /// Invalidates decisions already taken by the cost model.
   void invalidateCostModelingDecisions() {
     WideningDecisions.clear();
+    CallWideningDecisions.clear();
     Uniforms.clear();
     Scalars.clear();
   }
@@ -1705,7 +1730,7 @@
   /// part of that pattern.
   std::optional<InstructionCost>
   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
-                          TTI::TargetCostKind CostKind);
+                          TTI::TargetCostKind CostKind) const;
 
   /// Calculate vectorization cost of memory instruction \p I.
   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
@@ -1830,6 +1855,11 @@
 
   DecisionList WideningDecisions;
 
+  using CallDecisionList =
+      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
+
+  CallDecisionList CallWideningDecisions;
+
   /// Returns true if \p V is expected to be vectorized and it needs to be
   /// extracted.
   bool needsExtract(Value *V, ElementCount VF) const {
@@ -3445,76 +3475,34 @@
   }
 }
 
-InstructionCost LoopVectorizationCostModel::getVectorCallCost(
-    CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
-  Function *F = CI->getCalledFunction();
-  Type *ScalarRetTy = CI->getType();
-  SmallVector<Type *, 4> Tys, ScalarTys;
-  bool MaskRequired = Legal->isMaskRequired(CI);
-  for (auto &ArgOp : CI->args())
-    ScalarTys.push_back(ArgOp->getType());
-
-  // Estimate cost of scalarized vector call. The source operands are assumed
-  // to be vectors, so we need to extract individual elements from there,
-  // execute VF scalar calls, and then gather the result into the vector return
-  // value.
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  InstructionCost ScalarCallCost =
-      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
-  if (VF.isScalar())
-    return ScalarCallCost;
+InstructionCost
+LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+                                              ElementCount VF) const {
+  // We only need to calculate a cost if the VF is scalar; for actual vectors
+  // we should already have a pre-calculated cost at each VF.
+  if (VF.isScalar()) {
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    Type *RetTy = CI->getType();
+    if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+      if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+        return *RedCost;
 
-  // Compute corresponding vector type for return value and arguments.
-  Type *RetTy = ToVectorTy(ScalarRetTy, VF);
-  for (Type *ScalarTy : ScalarTys)
-    Tys.push_back(ToVectorTy(ScalarTy, VF));
+    SmallVector<Type *, 4> Tys;
+    for (auto &ArgOp : CI->args())
+      Tys.push_back(ArgOp->getType());
 
-  // Compute costs of unpacking argument values for the scalar calls and
-  // packing the return values to a vector.
-  InstructionCost ScalarizationCost =
-      getScalarizationOverhead(CI, VF, CostKind);
+    InstructionCost ScalarCallCost =
+        TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
 
-  InstructionCost Cost =
-      ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
-  // If we can't emit a vector call for this function, then the currently found
-  // cost is the cost we need to return.
-  InstructionCost MaskCost = 0;
-  VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
-  if (NeedsMask)
-    *NeedsMask = MaskRequired;
-  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-  // If we want an unmasked vector function but can't find one matching the VF,
-  // maybe we can find vector function that does use a mask and synthesize
-  // an all-true mask.
-  if (!VecFunc && !MaskRequired) {
-    Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
-    VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-    // If we found one, add in the cost of creating a mask
-    if (VecFunc) {
-      if (NeedsMask)
-        *NeedsMask = true;
-      MaskCost = TTI.getShuffleCost(
-          TargetTransformInfo::SK_Broadcast,
-          VectorType::get(
-              IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
-              VF));
+    // If this is an intrinsic we may have a lower cost for it.
+    if (getVectorIntrinsicIDForCall(CI, TLI)) {
+      InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+      return std::min(ScalarCallCost, IntrinsicCost);
    }
+    return ScalarCallCost;
  }
 
-  // We don't support masked function calls yet, but we can scalarize a
-  // masked call with branches (unless VF is scalable).
-  if (!TLI || CI->isNoBuiltin() || !VecFunc)
-    return VF.isScalable() ? InstructionCost::getInvalid() : Cost;
-
-  // If the corresponding vector cost is cheaper, return its cost.
-  InstructionCost VectorCallCost =
-      TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
-  if (VectorCallCost < Cost) {
-    *Variant = VecFunc;
-    Cost = VectorCallCost;
-  }
-  return Cost;
+  return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
 }
 
 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
@@ -4402,7 +4390,10 @@
   default:
     return true;
   case Instruction::Call:
-    return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+    if (VF.isScalar())
+      return true;
+    return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
+               .Kind == CM_Scalarize;
   case Instruction::Load:
   case Instruction::Store: {
     auto *Ptr = getLoadStorePointerOperand(I);
@@ -6115,6 +6106,8 @@
     if (ValuesToIgnore.count(I))
       continue;
 
+    collectInLoopReductions();
+
     // For each VF find the maximum usage of registers.
     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
       // Count the number of registers used, per register class, given all open
@@ -6634,7 +6627,8 @@
 
 std::optional<InstructionCost>
 LoopVectorizationCostModel::getReductionPatternCost(
-    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+    Instruction *I, ElementCount VF, Type *Ty,
+    TTI::TargetCostKind CostKind) const {
   using namespace llvm::PatternMatch;
   // Early exit for no inloop reductions
   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6672,10 +6666,10 @@
 
   // Find the reduction this chain is a part of and calculate the basic cost of
   // the reduction on its own.
-  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
+  Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
   Instruction *ReductionPhi = LastChain;
   while (!isa<PHINode>(ReductionPhi))
-    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
+    ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
 
   const RecurrenceDescriptor &RdxDesc =
       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
@@ -7093,6 +7087,122 @@
   }
 }
 
+void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
+  if (VF.isScalar())
+    return;
+
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    // For each instruction in the old loop.
+    for (Instruction &I : *BB) {
+      CallInst *CI = dyn_cast<CallInst>(&I);
+
+      if (!CI || CallWideningDecisions.contains(std::make_pair(CI, VF)))
+        continue;
+
+      InstructionCost ScalarCost = InstructionCost::getInvalid();
+      InstructionCost VectorCost = InstructionCost::getInvalid();
+      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+      Function *ScalarFunc = CI->getCalledFunction();
+      Type *ScalarRetTy = CI->getType();
+      SmallVector<Type *, 4> Tys, ScalarTys;
+      bool MaskRequired = Legal->isMaskRequired(CI);
+      for (auto &ArgOp : CI->args())
+        ScalarTys.push_back(ArgOp->getType());
+
+      // Compute corresponding vector type for return value and arguments.
+      Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+      for (Type *ScalarTy : ScalarTys)
+        Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+      // An in-loop reduction using an fmuladd intrinsic is a special case;
+      // we don't want the normal cost for that intrinsic.
+      if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+        if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+          setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
+                                  getVectorIntrinsicIDForCall(CI, TLI),
+                                  std::nullopt, *RedCost);
+          return;
+        }
+
+      // Estimate cost of scalarized vector call. The source operands are
+      // assumed to be vectors, so we need to extract individual elements from
+      // there, execute VF scalar calls, and then gather the result into the
+      // vector return value.
+      InstructionCost ScalarCallCost =
+          TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
+
+      // Compute costs of unpacking argument values for the scalar calls and
+      // packing the return values to a vector.
+      InstructionCost ScalarizationCost =
+          getScalarizationOverhead(CI, VF, CostKind);
+
+      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
+
+      // Find the cost of vectorizing the call, if we can find a suitable
+      // vector variant of the function.
+      InstructionCost MaskCost = 0;
+      VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
+      bool UsesMask = MaskRequired;
+      Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+      // If we want an unmasked vector function but can't find one matching the
+      // VF, maybe we can find vector function that does use a mask and
+      // synthesize an all-true mask.
+      if (!VecFunc && !MaskRequired) {
+        Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+        VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+        // If we found one, add in the cost of creating a mask
+        if (VecFunc) {
+          UsesMask = true;
+          MaskCost = TTI.getShuffleCost(
+              TargetTransformInfo::SK_Broadcast,
+              VectorType::get(IntegerType::getInt1Ty(
+                                  VecFunc->getFunctionType()->getContext()),
+                              VF));
+        }
+      }
+
+      std::optional<unsigned> MaskPos = std::nullopt;
+      if (VecFunc && UsesMask) {
+        for (const VFInfo &Info : VFDatabase::getMappings(*CI))
+          if (Info.Shape == Shape) {
+            assert(Info.isMasked() && "Vector function info shape mismatch");
+            MaskPos = Info.getParamIndexForOptionalMask().value();
+            break;
+          }
+
+        assert(MaskPos.has_value() && "Unable to find mask parameter index");
+      }
+
+      if (TLI && VecFunc && !CI->isNoBuiltin())
+        VectorCost =
+            TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
+
+      // Find the cost of an intrinsic; some targets may have instructions that
+      // perform the operation without needing an actual call.
+      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
+      if (IID != Intrinsic::not_intrinsic)
+        IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+
+      InstructionCost Cost = ScalarCost;
+      InstWidening Decision = CM_Scalarize;
+
+      if (VectorCost <= Cost) {
+        Cost = VectorCost;
+        Decision = CM_VectorCall;
+      }
+
+      if (IntrinsicCost <= Cost) {
+        Cost = IntrinsicCost;
+        Decision = CM_IntrinsicCall;
+      }
+
+      setCallWideningDecision(CI, VF, Decision, VecFunc, IID, MaskPos, Cost);
+    }
+  }
+}
+
 InstructionCost
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                                Type *&VectorTy) {
@@ -7350,6 +7460,9 @@
     return TTI::CastContextHint::Reversed;
   case LoopVectorizationCostModel::CM_Unknown:
     llvm_unreachable("Instr did not go through cost modelling?");
+  case LoopVectorizationCostModel::CM_VectorCall:
+  case LoopVectorizationCostModel::CM_IntrinsicCall:
+    llvm_unreachable_internal("Instr has invalid widening decision");
   }
 
   llvm_unreachable("Unhandled case!");
@@ -7407,19 +7520,8 @@
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
-  case Instruction::Call: {
-    if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
-      if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
-        return *RedCost;
-    Function *Variant;
-    CallInst *CI = cast<CallInst>(I);
-    InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
-    if (getVectorIntrinsicIDForCall(CI, TLI)) {
-      InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
-      return std::min(CallCost, IntrinsicCost);
-    }
-    return CallCost;
-  }
+  case Instruction::Call:
+    return getVectorCallCost(cast<CallInst>(I), VF);
   case Instruction::ExtractValue:
     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
   case Instruction::Alloca:
@@ -7590,9 +7692,9 @@
            "VF needs to be a power of two");
     // Collect the instructions (and their associated costs) that will be more
     // profitable to scalarize.
+    CM.collectInLoopReductions();
     if (CM.selectUserVectorizationFactor(UserVF)) {
       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-      CM.collectInLoopReductions();
       buildVPlansWithVPRecipes(UserVF, UserVF);
       if (!hasPlanWithVF(UserVF)) {
         LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
@@ -7616,6 +7718,7 @@
        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
     VFCandidates.insert(VF);
 
+  CM.collectInLoopReductions();
   for (const auto &VF : VFCandidates) {
     // Collect Uniform and Scalar instructions after vectorization with VF.
     CM.collectUniformsAndScalars(VF);
@@ -7626,7 +7729,6 @@
     CM.collectInstsToScalarize(VF);
   }
 
-  CM.collectInLoopReductions();
   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
   buildVPlansWithVPRecipes(ElementCount::getScalable(1),
                            MaxFactors.ScalableVF);
@@ -8424,22 +8526,15 @@
   bool ShouldUseVectorIntrinsic =
       ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                [&](ElementCount VF) -> bool {
-                 Function *Variant;
-                 // Is it beneficial to perform intrinsic call compared to lib
-                 // call?
-                 InstructionCost CallCost =
-                     CM.getVectorCallCost(CI, VF, &Variant);
-                 InstructionCost IntrinsicCost =
-                     CM.getVectorIntrinsicCost(CI, VF);
-                 return IntrinsicCost <= CallCost;
+                 return CM.getCallWideningDecision(CI, VF).Kind ==
+                        LoopVectorizationCostModel::CM_IntrinsicCall;
               },
               Range);
   if (ShouldUseVectorIntrinsic)
     return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
 
   Function *Variant = nullptr;
-  ElementCount VariantVF;
-  bool NeedsMask = false;
+  std::optional<unsigned> MaskPos;
   // Is better to call a vectorized version of the function than to to scalarize
   // the call?
   auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8458,16 +8553,19 @@
         // finds a valid variant.
         if (Variant)
          return false;
-        CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
-        // If we found a valid vector variant at this VF, then store the VF
-        // in case we need to generate a mask.
-        if (Variant)
-          VariantVF = VF;
-        return Variant != nullptr;
+        LoopVectorizationCostModel::CallWideningDecision Decision =
+            CM.getCallWideningDecision(CI, VF);
+        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
+          Variant = Decision.Variant;
+          MaskPos = Decision.MaskPos;
+          return true;
+        }
+
+        return false;
      },
      Range);
   if (ShouldUseVectorCall) {
-    if (NeedsMask) {
+    if (MaskPos.has_value()) {
       // We have 2 cases that would require a mask:
       // 1) The block needs to be predicated, either due to a conditional
       //    in the scalar loop or use of an active lane mask with
@@ -8482,17 +8580,7 @@
        Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue(
            IntegerType::getInt1Ty(Variant->getFunctionType()->getContext())));
 
-      VFShape Shape = VFShape::get(*CI, VariantVF, /*HasGlobalPred=*/true);
-      unsigned MaskPos = 0;
-
-      for (const VFInfo &Info : VFDatabase::getMappings(*CI))
-        if (Info.Shape == Shape) {
-          assert(Info.isMasked() && "Vector function info shape mismatch");
-          MaskPos = Info.getParamIndexForOptionalMask().value();
-          break;
-        }
-
-      Ops.insert(Ops.begin() + MaskPos, Mask);
+      Ops.insert(Ops.begin() + *MaskPos, Mask);
    }
 
    return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()),
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -6,8 +6,8 @@
 target triple="aarch64-unknown-linux-gnu"
 
-; CHECK-VF4: Found an estimated cost of 1 for VF 4 For instruction: %add = fadd float %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 2 for VF 8 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07
 
 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) {
 entry:
@@ -28,8 +28,8 @@
 }
 
-; CHECK-VF4: Found an estimated cost of 2 for VF 4 For instruction: %add = fadd double %0, %sum.07
-; CHECK-VF8: Found an estimated cost of 4 for VF 8 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07
 
 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) {
 entry:
@@ -49,7 +49,7 @@
   ret double %add
 }
 
-; CHECK-VF4: Found an estimated cost of 1 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF4: Found an estimated cost of 19 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
 ; CHECK-VF8: Found an estimated cost of 38 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
 
 define float @fmuladd_strict32(ptr %a, ptr %b, i64 %n) {
 entry:
@@ -74,8 +74,8 @@
 declare float @llvm.fmuladd.f32(float, float, float)
 
-; CHECK-VF4: Found an estimated cost of 4 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
-; CHECK-VF8: Found an estimated cost of 8 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
 
 define double @fmuladd_strict64(ptr %a, ptr %b, i64 %n) {
 entry: