diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -752,13 +752,16 @@
   /// extracted from vectors.
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) const;
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind) const;
 
   /// Estimate the overhead of scalarizing an instructions unique
   /// non-constant operands. The (potentially vector) types to use for each of
   /// argument are passes via Tys.
-  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                   ArrayRef<Type *> Tys) const;
+  InstructionCost
+  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                   ArrayRef<Type *> Tys,
+                                   TTI::TargetCostKind CostKind) const;
 
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
@@ -1193,6 +1196,7 @@
   /// case is to provision the cost of vectorization/scalarization in
   /// vectorizer passes.
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index = -1, Value *Op0 = nullptr,
                                      Value *Op1 = nullptr) const;
 
@@ -1203,6 +1207,7 @@
   /// A typical suitable use case is cost estimation when vector instruction
   /// exists (e.g., from basic blocks during transformation).
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index = -1) const;
 
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
@@ -1675,11 +1680,12 @@
   virtual bool useColdCCForColdCall(Function &F) = 0;
   virtual InstructionCost getScalarizationOverhead(VectorType *Ty,
                                                    const APInt &DemandedElts,
-                                                   bool Insert,
-                                                   bool Extract) = 0;
+                                                   bool Insert, bool Extract,
+                                                   TargetCostKind CostKind) = 0;
   virtual InstructionCost
   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys) = 0;
+                                   ArrayRef<Type *> Tys,
+                                   TargetCostKind CostKind) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool supportsTailCalls() = 0;
   virtual bool supportsTailCallFor(const CallBase *CB) = 0;
@@ -1787,9 +1793,11 @@
                                              TTI::TargetCostKind CostKind,
                                              const Instruction *I) = 0;
   virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                             TTI::TargetCostKind CostKind,
                                              unsigned Index, Value *Op0,
                                              Value *Op1) = 0;
   virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                             TTI::TargetCostKind CostKind,
                                              unsigned Index) = 0;
 
   virtual InstructionCost
@@ -2150,13 +2158,16 @@
 
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) override {
-    return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+                                           bool Insert, bool Extract,
+                                           TargetCostKind CostKind) override {
+    return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                         CostKind);
   }
   InstructionCost
   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   ArrayRef<Type *> Tys) override {
-    return Impl.getOperandsScalarizationOverhead(Args, Tys);
+                                   ArrayRef<Type *> Tys,
+                                   TargetCostKind CostKind) override {
+    return Impl.getOperandsScalarizationOverhead(Args, Tys, CostKind);
   }
 
   bool supportsEfficientVectorElementLoadStore() override {
@@ -2360,13 +2371,16 @@
                              const Instruction *I) override {
     return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
   }
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1) override {
-    return Impl.getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0,
+                                     Value *Op1) override {
+    return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
  }
  InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index) override {
-    return Impl.getVectorInstrCost(I, Val, Index);
+    return Impl.getVectorInstrCost(I, Val, CostKind, Index);
   }
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -333,12 +333,15 @@
 
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) const {
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind) const {
     return 0;
   }
 
-  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                   ArrayRef<Type *> Tys) const {
+  InstructionCost
+  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                   ArrayRef<Type *> Tys,
+                                   TTI::TargetCostKind CostKind) const {
     return 0;
   }
 
@@ -585,12 +588,15 @@
     return 1;
   }
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1) const {
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0,
+                                     Value *Op1) const {
     return 1;
   }
 
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index) const {
     return 1;
   }
 
@@ -1176,7 +1182,7 @@
       if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
-      return TargetTTI->getVectorInstrCost(*IE, Ty, Idx);
+      return TargetTTI->getVectorInstrCost(*IE, Ty, CostKind, Idx);
     }
     case Instruction::ShuffleVector: {
       auto *Shuffle = dyn_cast<ShuffleVectorInst>(U);
@@ -1272,7 +1278,7 @@
         if (CI->getValue().getActiveBits() <= 32)
           Idx = CI->getZExtValue();
       Type *DstTy = U->getOperand(0)->getType();
-      return TargetTTI->getVectorInstrCost(*EEI, DstTy, Idx);
+      return TargetTTI->getVectorInstrCost(*EEI, DstTy, CostKind, Idx);
     }
     }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -86,23 +86,25 @@
 
   /// Estimate a cost of Broadcast as an extract and sequence of insert
   /// operations.
-  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
+  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
+                                              TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
-    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0,
-                                        nullptr, nullptr);
+    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
+                                        CostKind, 0, nullptr, nullptr);
 
     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
-                                          nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
+                                          CostKind, i, nullptr, nullptr);
     }
     return Cost;
   }
 
   /// Estimate a cost of shuffle as a sequence of extract and insert
   /// operations.
-  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
+  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
+                                            TTI::TargetCostKind CostKind) {
     InstructionCost Cost = 0;
     // Shuffle cost is equal to the cost of extracting element from its argument
     // plus the cost of inserting them onto the result vector.
@@ -112,18 +114,20 @@
     // vector and finally index 3 of second vector and insert them at index
     // <0,1,2,3> of result vector.
     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i,
-                                          nullptr, nullptr);
-      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i,
-                                          nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
+                                          CostKind, i, nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
+                                          CostKind, i, nullptr, nullptr);
     }
     return Cost;
   }
 
   /// Estimate a cost of subvector extraction as a sequence of extract and
   /// insert operations.
-  InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
-                                              FixedVectorType *SubVTy) {
+  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
+                                              TTI::TargetCostKind CostKind,
+                                              int Index,
+                                              FixedVectorType *SubVTy) {
     assert(VTy && SubVTy && "Can only extract subvectors from vectors");
     int NumSubElts = SubVTy->getNumElements();
 
@@ -137,18 +141,21 @@
     // the source type plus the cost of inserting them into the result vector
     // type.
     for (int i = 0; i != NumSubElts; ++i) {
-      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
-                                          i + Index, nullptr, nullptr);
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i,
-                                          nullptr, nullptr);
+      Cost +=
+          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
+                                      CostKind, i + Index, nullptr, nullptr);
+      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
+                                          CostKind, i, nullptr, nullptr);
     }
     return Cost;
   }
 
   /// Estimate a cost of subvector insertion as a sequence of extract and
   /// insert operations.
-  InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
-                                             FixedVectorType *SubVTy) {
+  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
+                                             TTI::TargetCostKind CostKind,
+                                             int Index,
+                                             FixedVectorType *SubVTy) {
     assert(VTy && SubVTy && "Can only insert subvectors into vectors");
     int NumSubElts = SubVTy->getNumElements();
 
@@ -163,9 +170,10 @@
     // type.
     for (int i = 0; i != NumSubElts; ++i) {
       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
-                                          i, nullptr, nullptr);
-      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
-                                          i + Index, nullptr, nullptr);
+                                          CostKind, i, nullptr, nullptr);
+      Cost +=
+          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
+                                      i + Index, nullptr, nullptr);
     }
     return Cost;
   }
@@ -216,7 +224,7 @@
               FixedVectorType::get(
                   PointerType::get(VT->getElementType(), 0),
                   VT->getNumElements()),
-              -1, nullptr, nullptr)
+              CostKind, -1, nullptr, nullptr)
           : 0;
     InstructionCost LoadCost =
         VT->getNumElements() *
@@ -224,8 +232,9 @@
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0,
                         CostKind));
 
     // Next, compute the cost of packing the result in a vector.
-    InstructionCost PackingCost = getScalarizationOverhead(
-        VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
+    InstructionCost PackingCost =
+        getScalarizationOverhead(VT, Opcode != Instruction::Store,
+                                 Opcode == Instruction::Store, CostKind);
 
     InstructionCost ConditionalCost = 0;
     if (VariableMask) {
@@ -241,7 +250,7 @@
                  Instruction::ExtractElement,
                  FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                       VT->getNumElements()),
-                 -1, nullptr, nullptr) +
+                 CostKind, -1, nullptr, nullptr) +
              getCFInstrCost(Instruction::Br, CostKind) +
              getCFInstrCost(Instruction::PHI, CostKind));
     }
@@ -710,7 +719,8 @@
   /// extracted from vectors.
   InstructionCost getScalarizationOverhead(VectorType *InTy,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract) {
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind) {
     /// FIXME: a bitfield is not a reasonable abstraction for talking about
     /// which elements are needed from a scalable vector
     if (isa<ScalableVectorType>(InTy))
@@ -726,11 +736,11 @@
       if (!DemandedElts[i])
         continue;
       if (Insert)
-        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i,
-                                            nullptr, nullptr);
+        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
+                                            CostKind, i, nullptr, nullptr);
       if (Extract)
-        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i,
-                                            nullptr, nullptr);
+        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                            CostKind, i, nullptr, nullptr);
     }
 
     return Cost;
@@ -738,20 +748,24 @@
 
   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
-                                           bool Extract) {
+                                           bool Extract,
+                                           TTI::TargetCostKind CostKind) {
     if (isa<ScalableVectorType>(InTy))
       return InstructionCost::getInvalid();
     auto *Ty = cast<FixedVectorType>(InTy);
 
     APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
-    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                             CostKind);
   }
 
   /// Estimate the overhead of scalarizing an instructions unique
   /// non-constant operands. The (potentially vector) types to use for each of
   /// argument are passes via Tys.
-  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                   ArrayRef<Type *> Tys) {
+  InstructionCost
+  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                   ArrayRef<Type *> Tys,
+                                   TTI::TargetCostKind CostKind) {
     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
 
     InstructionCost Cost = 0;
@@ -766,7 +780,8 @@
 
       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
         if (auto *VecTy = dyn_cast<VectorType>(Ty))
-          Cost += getScalarizationOverhead(VecTy, false, true);
+          Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
+                                           /*Extract*/ true, CostKind);
       }
     }
 
@@ -779,14 +794,17 @@
   /// added as a heuristic.
   InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                            ArrayRef<const Value *> Args,
-                                           ArrayRef<Type *> Tys) {
-    InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
+                                           ArrayRef<Type *> Tys,
+                                           TTI::TargetCostKind CostKind) {
+    InstructionCost Cost = getScalarizationOverhead(
+        RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
     if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args, Tys);
+      Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
     else
       // When no information on arguments is provided, we add the cost
       // associated with one argument as a heuristic.
-      Cost += getScalarizationOverhead(RetTy, false, true);
+      Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
+                                       /*Extract*/ true, CostKind);
 
     return Cost;
   }
@@ -898,7 +916,7 @@
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
       SmallVector<Type *> Tys(Args.size(), Ty);
-      return getScalarizationOverhead(VTy, Args, Tys) +
+      return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
              VTy->getNumElements() * Cost;
     }
 
@@ -951,7 +969,7 @@
     switch (improveShuffleKindFromMask(Kind, Mask)) {
     case TTI::SK_Broadcast:
       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
-        return getBroadcastShuffleOverhead(FVT);
+        return getBroadcastShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_Select:
     case TTI::SK_Splice:
@@ -960,13 +978,13 @@
     case TTI::SK_PermuteSingleSrc:
     case TTI::SK_PermuteTwoSrc:
       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
-        return getPermuteShuffleOverhead(FVT);
+        return getPermuteShuffleOverhead(FVT, CostKind);
       return InstructionCost::getInvalid();
     case TTI::SK_ExtractSubvector:
-      return getExtractSubvectorOverhead(Tp, Index,
-                                         cast<FixedVectorType>(SubTp));
+      return getExtractSubvectorOverhead(Tp, CostKind, Index,
+                                         cast<FixedVectorType>(SubTp));
     case TTI::SK_InsertSubvector:
-      return getInsertSubvectorOverhead(Tp, Index,
-                                        cast<FixedVectorType>(SubTp));
+      return getInsertSubvectorOverhead(Tp, CostKind, Index,
+                                        cast<FixedVectorType>(SubTp));
     }
     llvm_unreachable("Unknown TTI::ShuffleKind");
@@ -1110,7 +1128,9 @@
 
     // Return the cost of multiple scalar invocation plus the cost of
     // inserting and extracting the values.
-    return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
+    return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
+                                    CostKind) +
+           Num * Cost;
   }
 
   // We already handled vector-to-vector and scalar-to-scalar conversions.
@@ -1119,8 +1139,12 @@
   // that the conversion is scalarized in one way or another.
   if (Opcode == Instruction::BitCast) {
     // Illegal bitcasts are done by storing and loading from a stack slot.
-    return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
-           (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
+    return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
+                                              /*Extract*/ true, CostKind)
+                   : 0) +
+           (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
+                                              /*Extract*/ false, CostKind)
+                   : 0);
   }
 
   llvm_unreachable("Unhandled cast");
@@ -1128,11 +1152,11 @@
   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                            VectorType *VecTy, unsigned Index) {
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                       Index, nullptr, nullptr) +
+                                       CostKind, Index, nullptr, nullptr) +
            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
-                                     TTI::CastContextHint::None,
-                                     TTI::TCK_RecipThroughput);
+                                     TTI::CastContextHint::None, CostKind);
   }
 
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
@@ -1183,19 +1207,23 @@
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
+      return getScalarizationOverhead(ValVTy, /*Insert*/ true,
+                                      /*Extract*/ false, CostKind) +
+             Num * Cost;
     }
 
     // Unknown scalar opcode.
     return 1;
   }
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1) {
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1) {
     return getRegUsageForType(Val->getScalarType());
   }
 
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index) {
     Value *Op0 = nullptr;
     Value *Op1 = nullptr;
@@ -1203,7 +1231,8 @@
       Op0 = IE->getOperand(0);
      Op1 = IE->getOperand(1);
     }
-    return thisT()->getVectorInstrCost(I.getOpcode(), Val, Index, Op0, Op1);
+    return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
+                                       Op1);
   }
 
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
@@ -1231,10 +1260,10 @@
     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
                                               /*Insert*/ false,
-                                              /*Extract*/ true);
-    Cost +=
-        thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
-                                          /*Insert*/ true, /*Extract*/ false);
+                                              /*Extract*/ true, CostKind);
+    Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
+                                              /*Insert*/ true,
+                                              /*Extract*/ false, CostKind);
 
     return Cost;
   }
@@ -1275,9 +1304,9 @@
       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
         // This is a vector load/store for some illegal type that is scalarized.
         // We must account for the cost of building or decomposing the vector.
-        Cost += getScalarizationOverhead(cast<FixedVectorType>(Src),
-                                         Opcode != Instruction::Store,
-                                         Opcode == Instruction::Store);
+        Cost += getScalarizationOverhead(
+            cast<FixedVectorType>(Src), Opcode != Instruction::Store,
+            Opcode == Instruction::Store, CostKind);
       }
     }
 
@@ -1389,13 +1418,13 @@
       // %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
       // <8 x i32> vector and insert them into a <4 x i32> vector.
-      InstructionCost InsSubCost =
-          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
-                                            /*Insert*/ true, /*Extract*/ false);
+      InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
+          SubVT, DemandedAllSubElts,
+          /*Insert*/ true, /*Extract*/ false, CostKind);
       Cost += Indices.size() * InsSubCost;
-      Cost +=
-          thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
-                                            /*Insert*/ false, /*Extract*/ true);
+      Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
+                                                /*Insert*/ false,
+                                                /*Extract*/ true, CostKind);
     } else {
       // The interleave cost is extract elements from sub vectors, and
       // insert them into the wide vector.
@@ -1410,13 +1439,13 @@
       // The cost is estimated as extract all elements (of actual members,
       // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
       // i32> vector.
-      InstructionCost ExtSubCost =
-          thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
-                                            /*Insert*/ false, /*Extract*/ true);
+      InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
+          SubVT, DemandedAllSubElts,
+          /*Insert*/ false, /*Extract*/ true, CostKind);
       Cost += ExtSubCost * Indices.size();
       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
                                                 /*Insert*/ true,
-                                                /*Extract*/ false);
+                                                /*Extract*/ false, CostKind);
     }
 
     if (!UseMaskForCond)
@@ -1649,10 +1678,11 @@
     if (RetVF.isVector() && !RetVF.isScalable()) {
       ScalarizationCost = 0;
       if (!RetTy->isVoidTy())
-        ScalarizationCost +=
-            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
+        ScalarizationCost += getScalarizationOverhead(
+            cast<VectorType>(RetTy),
+            /*Insert*/ true, /*Extract*/ false, CostKind);
       ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
+          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
     }
 
     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
                                   ScalarizationCost);
@@ -1704,7 +1734,8 @@
     Type *ScalarRetTy = RetTy;
     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
       if (!SkipScalarizationCost)
-        ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
+        ScalarizationCost = getScalarizationOverhead(
+            RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
       ScalarCalls = std::max(ScalarCalls,
                              cast<FixedVectorType>(RetVTy)->getNumElements());
       ScalarRetTy = RetTy->getScalarType();
@@ -1714,7 +1745,8 @@
       Type *Ty = Tys[i];
       if (auto *VTy = dyn_cast<VectorType>(Ty)) {
         if (!SkipScalarizationCost)
-          ScalarizationCost += getScalarizationOverhead(VTy, false, true);
+          ScalarizationCost += getScalarizationOverhead(
+              VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
         ScalarCalls = std::max(ScalarCalls,
                                cast<FixedVectorType>(VTy)->getNumElements());
         Ty = Ty->getScalarType();
@@ -2124,8 +2156,10 @@
       return InstructionCost::getInvalid();
 
     InstructionCost ScalarizationCost =
-        SkipScalarizationCost ? ScalarizationCostPassed
-                              : getScalarizationOverhead(RetVTy, true, false);
+        SkipScalarizationCost
+            ? ScalarizationCostPassed
+            : getScalarizationOverhead(RetVTy, /*Insert*/ true,
+                                       /*Extract*/ false, CostKind);
 
     unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
     SmallVector<Type *, 4> ScalarTys;
@@ -2141,7 +2175,8 @@
     for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
       if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
         if (!ICA.skipScalarizationCost())
-          ScalarizationCost += getScalarizationOverhead(VTy, false, true);
+          ScalarizationCost += getScalarizationOverhead(
+              VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
         ScalarCalls = std::max(ScalarCalls,
                                cast<FixedVectorType>(VTy)->getNumElements());
       }
@@ -2258,8 +2293,8 @@
     ArithCost += NumReduxLevels *
                  thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
     return ShuffleCost + ArithCost +
-           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                       nullptr, nullptr);
+           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                       CostKind, 0, nullptr, nullptr);
   }
 
   /// Try to calculate the cost of performing strict (in-order) reductions,
@@ -2286,8 +2321,8 @@
       return InstructionCost::getInvalid();
 
     auto *VTy = cast<FixedVectorType>(Ty);
-    InstructionCost ExtractCost =
-        getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
+    InstructionCost ExtractCost = getScalarizationOverhead(
+        VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
         Opcode, VTy->getElementType(), CostKind);
     ArithCost *= VTy->getNumElements();
@@ -2366,8 +2401,8 @@
     // The last min/max should be in vector registers and we counted it above.
     // So just need a single extractelement.
     return ShuffleCost + MinMaxCost +
-           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                       nullptr, nullptr);
+           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                       CostKind, 0, nullptr, nullptr);
   }
 
   InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -513,16 +513,17 @@
   return TTIImpl->useColdCCForColdCall(F);
 }
 
-InstructionCost
-TargetTransformInfo::getScalarizationOverhead(VectorType *Ty,
-                                              const APInt &DemandedElts,
-                                              bool Insert, bool Extract) const {
-  return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+InstructionCost TargetTransformInfo::getScalarizationOverhead(
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                           CostKind);
 }
 
 InstructionCost TargetTransformInfo::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys) const {
-  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys);
+    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys,
+    TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys, CostKind);
 }
 
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
@@ -898,23 +899,25 @@
 }
 
 InstructionCost TargetTransformInfo::getVectorInstrCost(
-    unsigned Opcode, Type *Val, unsigned Index, Value *Op0, Value *Op1) const {
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    Value *Op0, Value *Op1) const {
   // FIXME: Assert that Opcode is either InsertElement or ExtractElement.
   // This is mentioned in the interface description and respected by all
   // callers, but never asserted upon.
   InstructionCost Cost =
-      TTIImpl->getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+      TTIImpl->getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
-InstructionCost TargetTransformInfo::getVectorInstrCost(const Instruction &I,
-                                                        Type *Val,
-                                                        unsigned Index) const {
+InstructionCost
+TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
+                                        TTI::TargetCostKind CostKind,
+                                        unsigned Index) const {
   // FIXME: Assert that Opcode is either InsertElement or ExtractElement.
   // This is mentioned in the interface description and respected by all
   // callers, but never asserted upon.
-  InstructionCost Cost = TTIImpl->getVectorInstrCost(I, Val, Index);
+  InstructionCost Cost = TTIImpl->getVectorInstrCost(I, Val, CostKind, Index);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7383,11 +7383,11 @@
   // The scalar chain of computation has to pay for the transition
   // scalar to vector.
   // The vector chain has to account for the combining cost.
-  InstructionCost ScalarCost =
-      TTI.getVectorInstrCost(*Transition, PromotedType, Index);
-  InstructionCost VectorCost = StoreExtractCombineCost;
   enum TargetTransformInfo::TargetCostKind CostKind =
       TargetTransformInfo::TCK_RecipThroughput;
+  InstructionCost ScalarCost =
+      TTI.getVectorInstrCost(*Transition, PromotedType, CostKind, Index);
+  InstructionCost VectorCost = StoreExtractCombineCost;
   for (const auto &Inst : InstsToBePromoted) {
     // Compute the cost.
     // By construction, all instructions being promoted are arithmetic ones.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -169,9 +169,11 @@
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index);
 
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2131,14 +2131,14 @@
 
   // Get the cost for the extract. We compute the cost (if any) for the extend
   // below.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                            Index, nullptr, nullptr);
+                                            CostKind, Index, nullptr, nullptr);
 
   // Legalize the types.
   auto VecLT = getTypeLegalizationCost(VecTy);
   auto DstVT = TLI->getValueType(DL, Dst);
   auto SrcVT = TLI->getValueType(DL, Src);
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   // If the resulting type is still a vector and the destination type is legal,
   // we may get the extension for free. If not, get the default cost for the
@@ -2225,13 +2225,16 @@
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   TTI::TargetCostKind CostKind,
                                                    unsigned Index, Value *Op0,
                                                    Value *Op1) {
   return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
-                                                   Type *Val, unsigned Index) {
+                                                   Type *Val,
+                                                   TTI::TargetCostKind CostKind,
+                                                   unsigned Index) {
   return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -162,6 +162,7 @@
 
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
 
   bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -790,6 +790,7 @@
 }
 
 InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                               TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
   switch (Opcode) {
@@ -800,7 +801,8 @@
     if (EltSize < 32) {
       if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
         return 0;
-      return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
+                                       Op1);
     }
 
     // Extracts are just reads of a subregister, so are free. Inserts are
@@ -811,7 +813,7 @@
     return Index == ~0u ? 2 : 0;
   }
   default:
-    return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -62,6 +62,7 @@
                                  const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                     TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
 };
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -108,6 +108,7 @@
 }
 
 InstructionCost R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                                TTI::TargetCostKind CostKind,
                                                 unsigned Index, Value *Op0,
                                                 Value *Op1) {
   switch (Opcode) {
@@ -116,7 +117,8 @@
     unsigned EltSize =
         DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
     if (EltSize < 32) {
-      return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
+                                       Op1);
     }
 
     // Extracts are just reads of a subregister, so are free. Inserts are
@@ -127,7 +129,7 @@
     return Index == ~0u ? 2 : 0;
   }
   default:
-    return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
   }
 }
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -240,8 +240,9 @@
                                  const Instruction *I = nullptr);
 
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
 
   InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
                                             const SCEV *Ptr);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -874,6 +874,7 @@
 }
 
 InstructionCost ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
+                                               TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
   // Penalize inserting into an D-subregister. We end up with a three times
@@ -894,7 +895,8 @@
 
     if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
       return std::max(
-          BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1), 2U);
+          BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1),
+          2U);
   }
 
   if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
@@ -907,7 +909,7 @@
     return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
   }
 
-  return BaseT::getVectorInstrCost(Opcode, ValTy, Index, Op0, Op1);
+  return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
 }
 
 InstructionCost ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
@@ -1021,12 +1023,14 @@
   if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
     // One scalaization insert, one scalarization extract and the cost of the
     // fcmps.
-    return BaseT::getScalarizationOverhead(VecValTy, false, true) +
-           BaseT::getScalarizationOverhead(VecCondTy, true, false) +
+    return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
+                                           /*Extract*/ true, CostKind) +
+           BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
+                                           /*Extract*/ false, CostKind) +
            VecValTy->getNumElements() *
-               getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
-                                  VecCondTy->getScalarType(), VecPred, CostKind,
-                                  I);
+               getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
+                                  VecCondTy->getScalarType(), VecPred,
+                                  CostKind, I);
   }
 
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
@@ -1039,7 +1043,8 @@
     if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
       if (LT.first > 1)
         return LT.first * BaseCost +
-               BaseT::getScalarizationOverhead(VecCondTy, true, false);
+               BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
+                                               /*Extract*/ false, CostKind);
       return BaseCost;
     }
   }
@@ -1442,7 +1447,8 @@
     // Return the cost of multiple scalar invocation plus the cost of
     // inserting and extracting the values.
     SmallVector<Type *> Tys(Args.size(), Ty);
-    return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
+    return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
+           Num * Cost;
   }
 
   return BaseCost;
@@ -1581,8 +1587,11 @@
   // The scalarization cost should be a lot higher. We use the number of vector
   // elements plus the scalarization overhead.
   InstructionCost ScalarCost =
-      NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, true, false) +
-      BaseT::getScalarizationOverhead(VTy, false, true);
+      NumElems * LT.first +
+      BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
+                                      CostKind) +
+      BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
+                                      CostKind);
 
   if (EltSize < 8 || Alignment < EltSize / 8)
     return ScalarCost;
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -107,9 +107,12 @@
 
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract);
-  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                   ArrayRef<Type *> Tys);
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
+  InstructionCost
+  getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                   ArrayRef<Type *> Tys,
+                                   TTI::TargetCostKind CostKind);
   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
                                    ArrayRef<Type *> Tys,
                                    TTI::TargetCostKind CostKind);
@@ -154,8 +157,9 @@
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
 
   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -139,14 +139,17 @@
 }
 
 InstructionCost HexagonTTIImpl::getScalarizationOverhead(
-    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) {
-  return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
+    VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
+    TTI::TargetCostKind CostKind) {
+  return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
+                                         CostKind);
 }
 
 InstructionCost
 HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                 ArrayRef<Type *> Tys) {
-  return BaseT::getOperandsScalarizationOverhead(Args, Tys);
+                                                 ArrayRef<Type *> Tys,
+                                                 TTI::TargetCostKind CostKind) {
+  return BaseT::getOperandsScalarizationOverhead(Args, Tys, CostKind);
 }
 
 InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
@@ -329,6 +332,7 @@
 }
 
 InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   TTI::TargetCostKind CostKind,
                                                    unsigned Index, Value *Op0,
                                                    Value *Op1) {
   Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType()
                                    : Val;
@@ -339,8 +343,8 @@
     if (ElemTy->isIntegerTy(32))
       return Cost;
     // If it's not a 32-bit value, there will need to be an extract.
-    return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, Index,
-                                     Op0, Op1);
+    return Cost + getVectorInstrCost(Instruction::ExtractElement, Val, CostKind,
+                                     Index, Op0, Op1);
   }
 
   if (Opcode == Instruction::ExtractElement)
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -126,8 +126,9 @@
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                   MaybeAlign Alignment, unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -675,6 +675,7 @@
 }
 
 InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
   assert(Val->isVectorTy() && "This must be a vector type");
@@ -687,7 +688,7 @@
     return InstructionCost::getMax();
 
   InstructionCost Cost =
-      BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+      BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
   Cost *= CostFactor;
 
   if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
@@ -829,8 +830,8 @@
   if (Src->isVectorTy() && Opcode == Instruction::Store)
     for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
         ++i)
-      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i, nullptr,
-                                 nullptr);
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Src, CostKind, i,
+                                 nullptr, nullptr);
 
   return Cost;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -157,8 +157,9 @@
                                  const Instruction *I = nullptr);
 
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1198,13 +1198,14 @@
 }
 
 InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                 TTI::TargetCostKind CostKind,
                                                  unsigned Index, Value *Op0,
                                                  Value *Op1) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
   if (Opcode != Instruction::ExtractElement &&
       Opcode != Instruction::InsertElement)
-    return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
 
   // Legalize the type.
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -1218,7 +1219,7 @@
     return LT.first;
 
   if (!isTypeLegal(Val))
-    return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+    return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
 
   // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
   // and vslideup + vmv.s.x to insert element to vector.
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -107,8 +107,9 @@
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
   bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                   MaybeAlign Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -532,7 +532,8 @@
       return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
     if (DivRemConst) {
       SmallVector<Type *> Tys(Args.size(), Ty);
-      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
+      return VF * DivMulSeqCost +
+             getScalarizationOverhead(VTy, Args, Tys, CostKind);
     }
     if ((SignedDivRem || UnsignedDivRem) && VF > 4)
       // Temporary hack: disable high vectorization factors with integer
@@ -558,7 +559,8 @@
           getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
       SmallVector<Type *> Tys(Args.size(), Ty);
       InstructionCost Cost =
-          (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
+          (VF * ScalarCost) +
+          getScalarizationOverhead(VTy, Args, Tys, CostKind);
       // FIXME: VF 2 for these FP operations are currently just as
       // expensive as for VF 4.
       if (VF == 2)
@@ -576,8 +578,8 @@
     // There is no native support for FRem.
     if (Opcode == Instruction::FRem) {
       SmallVector<Type *> Tys(Args.size(), Ty);
-      InstructionCost Cost =
-          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
+      InstructionCost Cost = (VF * LIBCALL_COST) +
+                             getScalarizationOverhead(VTy, Args, Tys, CostKind);
       // FIXME: VF 2 for float is currently just as expensive as for VF 4.
       if (VF == 2 && ScalarBits == 32)
         Cost *= 2;
@@ -865,8 +867,10 @@
         (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
       NeedsExtracts = false;
 
-    TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
-    TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);
+    TotCost += getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                        NeedsExtracts, CostKind);
+    TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts,
+                                        /*Extract*/ false, CostKind);
 
     // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
     if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -878,7 +882,8 @@
     if (Opcode == Instruction::FPTrunc) {
       if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
         return VF /*ldxbr/lexbr*/ +
-               getScalarizationOverhead(DstVecTy, true, false);
+               getScalarizationOverhead(DstVecTy, /*Insert*/ true,
+                                        /*Extract*/ false, CostKind);
       else // double -> float
         return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
     }
@@ -891,7 +896,8 @@
         return VF * 2;
       }
       // -> fp128. VF * lxdb/lxeb + extraction of elements.
-      return VF + getScalarizationOverhead(SrcVecTy, false, true);
+      return VF + getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
+                                           /*Extract*/ true, CostKind);
     }
   }
@@ -996,6 +1002,7 @@
 }
 
 InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                   TTI::TargetCostKind CostKind,
                                                    unsigned Index, Value *Op0,
                                                    Value *Op1) {
   // vlvgp will insert two grs into a vector register, so only count half the
@@ -1013,7 +1020,7 @@
     return Cost;
   }
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
 }
 
 // Check if a load may be folded as a memory operand in its user.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -66,8 +66,9 @@
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
 
   /// @}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -80,12 +80,12 @@
   return Cost;
 }
 
-InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode,
-                                                       Type *Val,
-                                                       unsigned Index,
-                                                       Value *Op0, Value *Op1) {
-  InstructionCost Cost =
-      BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index, Op0, Op1);
+InstructionCost
+WebAssemblyTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                       TTI::TargetCostKind CostKind,
+                                       unsigned Index, Value *Op0, Value *Op1) {
+  InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
+      Opcode, Val, CostKind, Index, Op0, Op1);
 
   // SIMD128's insert/extract currently only take constant indices.
   if (Index == -1u)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -147,11 +147,13 @@
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr);
   using BaseT::getVectorInstrCost;
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index,
-                                     Value *Op0, Value *Op1);
+  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                     TTI::TargetCostKind CostKind,
+                                     unsigned Index, Value *Op0, Value *Op1);
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
-                                           bool Insert, bool Extract);
+                                           bool Insert, bool Extract,
+                                           TTI::TargetCostKind CostKind);
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4257,6 +4257,7 @@
 }
 
 InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                               TTI::TargetCostKind CostKind,
                                                unsigned Index, Value *Op0,
                                                Value *Op1) {
   static const CostTblEntry SLMCostTbl[] = {
@@ -4269,7 +4270,6 @@
   assert(Val->isVectorTy() && "This must be a vector type");
   Type *ScalarType = Val->getScalarType();
   InstructionCost RegisterFileMoveCost = 0;
-  TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput;
 
   // Non-immediate extraction/insertion can be handled as a sequence of
   // aliased loads+stores via the stack.
@@ -4401,14 +4401,14 @@
   if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
     RegisterFileMoveCost += 1;
 
-  return BaseT::getVectorInstrCost(Opcode, Val, Index, Op0, Op1) +
+  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
          RegisterFileMoveCost;
 }
 
-InstructionCost X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
-                                                     const APInt &DemandedElts,
-                                                     bool Insert,
-                                                     bool Extract) {
+InstructionCost
+X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+                                     bool Insert, bool Extract,
+                                     TTI::TargetCostKind CostKind) {
   assert(DemandedElts.getBitWidth() ==
              cast<FixedVectorType>(Ty)->getNumElements() &&
          "Vector size mismatch");
@@ -4416,7 +4416,6 @@
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
   MVT MScalarTy = LT.second.getScalarType();
   unsigned LegalVectorBitWidth = LT.second.getSizeInBits();
-  TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput;
   InstructionCost Cost = 0;
 
   constexpr unsigned LaneBitWidth = 128;
@@ -4436,8 +4435,8 @@
       // For types we can insert directly, insertion into 128-bit sub vectors is
      // cheap, followed by a cheap chain of concatenations.
       if (LegalVectorBitWidth <= LaneBitWidth) {
-        Cost +=
-            BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
+        Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert,
+                                                /*Extract*/ false, CostKind);
       } else {
         // In each 128-lane, if at least one index is demanded but not all
         // indices are demanded and this 128-lane is not the first 128-lane of
@@ -4477,7 +4476,7 @@
           Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                  CostKind, I * NumEltsPerLane, LaneTy);
           Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, Insert,
-                                                  false);
+                                                  /*Extract*/ false, CostKind);
         }
 
         APInt AffectedLanes =
@@ -4554,8 +4553,8 @@
           continue;
         Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                CostKind, I * NumEltsPerLane, LaneTy);
-        Cost += BaseT::getScalarizationOverhead(LaneTy, LaneEltMask, false,
-                                                Extract);
+        Cost += BaseT::getScalarizationOverhead(
+            LaneTy, LaneEltMask, /*Insert*/ false, Extract, CostKind);
       }
 
       return Cost;
@@ -4563,7 +4562,8 @@
     }
   }
 
   // Fallback to default extraction.
-  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+  Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ false,
+                                          Extract, CostKind);
 
   return Cost;
 }
@@ -4815,7 +4815,7 @@
                                 CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
       assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
       Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
-                                       !IsLoad);
+                                       !IsLoad, CostKind);
     }
 
     // This isn't exactly right. We're using slow unaligned 32-byte accesses
@@ -4856,15 +4856,15 @@
       (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) {
     // Scalarization
     APInt DemandedElts = APInt::getAllOnes(NumElem);
-    InstructionCost MaskSplitCost =
-        getScalarizationOverhead(MaskTy, DemandedElts, false, true);
+    InstructionCost MaskSplitCost = getScalarizationOverhead(
+        MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
         Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
         CmpInst::BAD_ICMP_PREDICATE, CostKind);
     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
     InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
-    InstructionCost ValueSplitCost =
-        getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
+    InstructionCost ValueSplitCost = getScalarizationOverhead(
+        SrcVTy, DemandedElts, IsLoad, IsStore, CostKind);
     InstructionCost MemopCost =
         NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
                                          Alignment, AddressSpace, CostKind);
@@ -5174,8 +5174,8 @@
   }
 
   // Add the final extract element to the cost.
-  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                            nullptr, nullptr);
+  return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                            CostKind, 0, nullptr, nullptr);
 }
 
 InstructionCost X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy,
@@ -5475,8 +5475,8 @@
   }
 
   // Add the final extract element to the cost.
-  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0,
-                                         nullptr, nullptr);
+  return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty,
+                                         CostKind, 0, nullptr, nullptr);
 }
 
 /// Calculate the cost of materializing a 64-bit value. This helper
@@ -5781,7 +5781,7 @@
     auto *MaskTy =
         FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
     MaskUnpackCost = getScalarizationOverhead(
-        MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true);
+        MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
     InstructionCost ScalarCompareCost = getCmpSelInstrCost(
         Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
         CmpInst::BAD_ICMP_PREDICATE, CostKind);
@@ -5791,7 +5791,7 @@
 
   InstructionCost AddressUnpackCost = getScalarizationOverhead(
       FixedVectorType::get(ScalarTy->getPointerTo(), VF), DemandedElts,
-      /*Insert=*/false, /*Extract=*/true);
+      /*Insert=*/false, /*Extract=*/true, CostKind);
 
   // The cost of the scalar loads/stores.
   InstructionCost MemoryOpCost =
@@ -5800,10 +5800,10 @@
 
   // The cost of forming the vector from loaded scalars/
   // scalarizing the vector to perform scalar stores.
-  InstructionCost InsertExtractCost =
-      getScalarizationOverhead(cast<FixedVectorType>(SrcVTy), DemandedElts,
-                               /*Insert=*/Opcode == Instruction::Load,
-                               /*Extract=*/Opcode == Instruction::Store);
+  InstructionCost InsertExtractCost = getScalarizationOverhead(
+      cast<FixedVectorType>(SrcVTy), DemandedElts,
+      /*Insert=*/Opcode == Instruction::Load,
+      /*Extract=*/Opcode == Instruction::Store, CostKind);
 
   return AddressUnpackCost + MemoryOpCost + MaskUnpackCost +
          InsertExtractCost;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1683,8 +1683,8 @@
   /// Estimate the overhead of scalarizing an instruction. This is a
   /// convenience wrapper for the type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Instruction *I,
-                                           ElementCount VF) const;
+  InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
+                                           TTI::TargetCostKind CostKind) const;
 
   /// Returns true if an artificially high cost for emulated masked memrefs
   /// should be used.
@@ -3443,8 +3443,9 @@
   // to be vectors, so we need to extract individual elements from there,
   // execute VF scalar calls, and then gather the result into the vector return
   // value.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost ScalarCallCost =
-      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
+      TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
   if (VF.isScalar())
     return ScalarCallCost;
@@ -3455,7 +3456,8 @@
 
   // Compute costs of unpacking argument values for the scalar calls and
   // packing the return values to a vector.
-  InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
+  InstructionCost ScalarizationCost =
+      getScalarizationOverhead(CI, VF, CostKind);
 
   InstructionCost Cost =
       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
@@ -3471,7 +3473,7 @@
 
   // If the corresponding vector cost is cheaper, return its cost.
   InstructionCost VectorCallCost =
-      TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
+      TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind);
   if (VectorCallCost < Cost) {
     NeedToScalarize = false;
     Cost = VectorCallCost;
@@ -4478,7 +4480,7 @@
 
       // The cost of insertelement and extractelement instructions needed for
       // scalarization.
-      ScalarizationCost += getScalarizationOverhead(I, VF);
+      ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
 
       // Scale the cost by the probability of executing the predicated blocks.
     // This assumes the predicated block for each vector lane is equally
@@ -6239,13 +6241,14 @@

     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
       ScalarCost += TTI.getScalarizationOverhead(
           cast<VectorType>(ToVectorTy(I->getType(), VF)),
-          APInt::getAllOnes(VF.getFixedValue()), true, false);
+          APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true,
+          /*Extract*/ false, CostKind);
       ScalarCost +=
-          VF.getFixedValue() *
-          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+          VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
     }

     // Compute the scalarization overhead of needed extractelement
@@ -6261,7 +6264,8 @@
       else if (needsExtract(J, VF)) {
         ScalarCost += TTI.getScalarizationOverhead(
             cast<VectorType>(ToVectorTy(J->getType(), VF)),
-            APInt::getAllOnes(VF.getFixedValue()), false, true);
+            APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false,
+            /*Extract*/ true, CostKind);
       }
     }
@@ -6390,14 +6394,15 @@

   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   const Align Alignment = getLoadStoreAlignment(I);
-  Cost += VF.getKnownMinValue() *
-          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
-                              AS, TTI::TCK_RecipThroughput);
+  Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
+                                                      ValTy->getScalarType(),
+                                                      Alignment, AS, CostKind);

   // Get the overhead of the extractelement and insertelement instructions
   // we might create due to scalarization.
-  Cost += getScalarizationOverhead(I, VF);
+  Cost += getScalarizationOverhead(I, VF, CostKind);

   // If we have a predicated load/store, it will need extra i1 extracts and
   // conditional branches, but may not be executed for each vector lane. Scale
@@ -6410,8 +6415,8 @@
         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
     Cost += TTI.getScalarizationOverhead(
         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
-        /*Insert=*/false, /*Extract=*/true);
-    Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+        /*Insert=*/false, /*Extract=*/true, CostKind);
+    Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);

     if (useEmulatedMaskMemRefHack(I, VF))
       // Artificially setting to a high enough value to practically disable
@@ -6477,7 +6482,7 @@
          (isLoopInvariantStoreValue
               ? 0
               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
-                                       VF.getKnownMinValue() - 1));
+                                       CostKind, VF.getKnownMinValue() - 1));
 }

 InstructionCost
@@ -6772,9 +6777,8 @@
   return VectorizationCostTy(C, TypeNotScalarized);
 }

-InstructionCost
-LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
-                                                     ElementCount VF) const {
+InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
+    Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {

   // There is no mechanism yet to create a scalable scalarization loop,
   // so this is currently Invalid.
@@ -6789,8 +6793,9 @@
   if (!RetTy->isVoidTy() &&
       (!isa<StoreInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
     Cost += TTI.getScalarizationOverhead(
-        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
-        false);
+        cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()),
+        /*Insert*/ true,
+        /*Extract*/ false, CostKind);

   // Some targets keep addresses scalar.
   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6810,7 +6815,7 @@
   for (auto *V : filterExtractingOperands(Ops, VF))
     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
   return Cost + TTI.getOperandsScalarizationOverhead(
-                    filterExtractingOperands(Ops, VF), Tys);
+                    filterExtractingOperands(Ops, VF), Tys, CostKind);
 }

 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
@@ -7067,7 +7072,8 @@
           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
       return (
           TTI.getScalarizationOverhead(
-              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
+              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+              /*Insert*/ false, /*Extract*/ true, CostKind) +
           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
       // The back-edge branch will remain, as will all scalar branches.
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6664,7 +6664,8 @@
         continue;
       }
     }
-    Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), Idx);
+    Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), CostKind,
+                                    Idx);
   }
   // Add a cost for subvector extracts/inserts if required.
   for (const auto &Data : ExtractVectorsTys) {
@@ -6792,7 +6793,7 @@
     bool NeedShuffle =
         VL.front() != *It || !all_of(VL.drop_front(), UndefValue::classof);
     InstructionCost InsertCost =
-        TTI->getVectorInstrCost(Instruction::InsertElement, VecTy,
+        TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                 /*Index=*/0, PoisonValue::get(VecTy), *It);
     return InsertCost + (NeedShuffle ? TTI->getShuffleCost(
@@ -7047,7 +7048,7 @@
       }
     }
     return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
-                                   *getExtractIndex(I));
+                                   CostKind, *getExtractIndex(I));
   };
   auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
   return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -7116,7 +7117,8 @@
     InstructionCost Cost = 0;
     Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
-                                          /*Insert*/ true, /*Extract*/ false);
+                                          /*Insert*/ true, /*Extract*/ false,
+                                          CostKind);

     // First cost - resize to actual vector size if not identity shuffle or
     // need to shift the vector.
@@ -7995,6 +7997,7 @@
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
     auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
     if (MinBWs.count(ScalarRoot)) {
       auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
@@ -8004,8 +8007,8 @@
       ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                    VecTy, EU.Lane);
     } else {
-      ExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+      ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                             CostKind, EU.Lane);
     }
   }
@@ -8079,7 +8082,7 @@
                                     EstimateShufflesCost);
     InstructionCost InsertCost = TTI->getScalarizationOverhead(
         cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
-        /*Insert*/ true, /*Extract*/ false);
+        /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
     Cost -= InsertCost;
   }

@@ -8427,9 +8430,10 @@
 InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
                                        const APInt &ShuffledIndices,
                                        bool NeedToShuffle) const {
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost =
       TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
-                                    /*Extract*/ false);
+                                    /*Extract*/ false, CostKind);
   if (NeedToShuffle)
     Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
   return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -230,8 +230,10 @@
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
   APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
-                                          /* Insert */ true, HasExtract);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  OldCost +=
+      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+                                   /* Insert */ true, HasExtract, CostKind);

   // New pattern: load VecPtr
   InstructionCost NewCost =
@@ -346,9 +348,12 @@
     return nullptr;

   Type *VecTy = Ext0->getVectorOperand()->getType();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
-  InstructionCost Cost0 = TTI.getVectorInstrCost(*Ext0, VecTy, Index0);
-  InstructionCost Cost1 = TTI.getVectorInstrCost(*Ext1, VecTy, Index1);
+  InstructionCost Cost0 =
+      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
+  InstructionCost Cost1 =
+      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);

   // If both costs are invalid no shuffle is needed
   if (!Cost0.isValid() && !Cost1.isValid())
@@ -411,11 +416,12 @@
   // both sequences.
   unsigned Ext0Index = Ext0IndexC->getZExtValue();
   unsigned Ext1Index = Ext1IndexC->getZExtValue();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

   InstructionCost Extract0Cost =
-      TTI.getVectorInstrCost(*Ext0, VecTy, Ext0Index);
+      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
   InstructionCost Extract1Cost =
-      TTI.getVectorInstrCost(*Ext1, VecTy, Ext1Index);
+      TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);

   // A more expensive extract will always be replaced by a splat shuffle.
   // For example, if Ext0 is more expensive:
@@ -645,15 +651,16 @@
     Mask[Index] = Index + NumElts;

   Type *ScalarTy = VecTy->getScalarType();
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost OldCost =
       TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
-      TTI.getVectorInstrCost(I, VecTy, Index);
+      TTI.getVectorInstrCost(I, VecTy, CostKind, Index);

   // If the extract has one use, it will be eliminated, so count it in the
   // original cost. If it has more than one use, ignore the cost because it will
   // be the same before/after.
   if (Extract->hasOneUse())
-    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, Index);
+    OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);

   InstructionCost NewCost =
       TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
@@ -801,8 +808,9 @@

   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
-  InstructionCost InsertCost =
-      TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost InsertCost = TTI.getVectorInstrCost(
+      Instruction::InsertElement, VecTy, CostKind, Index);
   InstructionCost OldCost =
       (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
   InstructionCost NewCost = ScalarOpCost + InsertCost +
@@ -891,8 +899,10 @@
   if (!VecTy)
     return false;

-  InstructionCost OldCost = TTI.getVectorInstrCost(*Ext0, VecTy, Index0);
-  OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, Index1);
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
+  OldCost += TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
   OldCost +=
       TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
                              CmpInst::makeCmpResultType(I0->getType()), Pred) *
@@ -912,7 +922,7 @@
   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
                                 ShufMask);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
-  NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CheapIndex);
+  NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);

   // Aggressively form vector ops if the cost is equal because the transform
   // may enable further optimization.
@@ -1169,8 +1179,9 @@
     }

     auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     OriginalCost +=
-        TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT,
+        TTI.getVectorInstrCost(Instruction::ExtractElement, FixedVT, CostKind,
                                Index ? Index->getZExtValue() : -1);
     ScalarizedCost +=
         TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector-latency.ll
@@ -51,7 +51,7 @@
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 4, i32 5>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> undef, <2 x i32> <i32 6, i32 7>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> <i32 5, i32 6, i32 7, i32 undef>
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void