diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1069,10 +1069,11 @@ /// passed through \p Args, which helps improve the cost estimation in some /// cases, like in broadcast loads. /// NOTE: For subvector extractions Tp represents the source type. - InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask = None, int Index = 0, - VectorType *SubTp = nullptr, - ArrayRef Args = None) const; + InstructionCost + getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask = None, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, + int Index = 0, VectorType *SubTp = nullptr, + ArrayRef Args = None) const; /// Represents a hint about the context in which a cast is used. /// @@ -1715,8 +1716,9 @@ OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo, ArrayRef Args, const Instruction *CxtI = nullptr) = 0; virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, - VectorType *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, + int Index, VectorType *SubTp, ArrayRef Args) = 0; virtual InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, @@ -2265,10 +2267,11 @@ Opd1PropInfo, Opd2PropInfo, Args, CxtI); } InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args) override { - return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp, Args); + return Impl.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); } InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -510,7 +510,8 @@ } InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None) const { return 1; @@ -1185,13 +1186,13 @@ if (Shuffle->isExtractSubvectorMask(SubIndex)) return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy, - Shuffle->getShuffleMask(), SubIndex, - VecTy, Operands); + Shuffle->getShuffleMask(), CostKind, + SubIndex, VecTy, Operands); if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) return TargetTTI->getShuffleCost( TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(), - SubIndex, + CostKind, SubIndex, FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands); @@ -1216,37 +1217,38 @@ if (Shuffle->isReverse()) return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, - Shuffle->getShuffleMask(), 0, nullptr, - Operands); + Shuffle->getShuffleMask(), CostKind, 0, + nullptr, Operands); if (Shuffle->isSelect()) return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, - Shuffle->getShuffleMask(), 0, nullptr, - Operands); + Shuffle->getShuffleMask(), CostKind, 0, + nullptr, Operands); if (Shuffle->isTranspose()) return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, - Shuffle->getShuffleMask(), 0, nullptr, - Operands); + Shuffle->getShuffleMask(), CostKind, 0, + nullptr, Operands); if (Shuffle->isZeroEltSplat()) return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, - Shuffle->getShuffleMask(), 0, nullptr, - Operands); + Shuffle->getShuffleMask(), CostKind, 0, + nullptr, Operands); if (Shuffle->isSingleSource()) return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, - Shuffle->getShuffleMask(), 0, nullptr, - Operands); + Shuffle->getShuffleMask(), CostKind, 0, + nullptr, Operands); if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) return TargetTTI->getShuffleCost( - TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(), SubIndex, - FixedVectorType::get(VecTy->getScalarType(), NumSubElts), Operands); + TTI::SK_InsertSubvector, VecTy, Shuffle->getShuffleMask(), CostKind, + SubIndex, FixedVectorType::get(VecTy->getScalarType(), NumSubElts), + Operands); return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, - Shuffle->getShuffleMask(), 0, nullptr, - Operands); + Shuffle->getShuffleMask(), CostKind, 0, + nullptr, Operands); } case Instruction::ExtractElement: { auto *EEI = dyn_cast(U); diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -932,7 +932,8 @@ } InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None) { @@ -1514,9 +1515,9 @@ if (isa(RetTy)) return BaseT::getIntrinsicInstrCost(ICA, CostKind); unsigned Index = cast(Args[1])->getZExtValue(); - return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, - cast(Args[0]->getType()), None, - Index, cast(RetTy)); + return thisT()->getShuffleCost( + TTI::SK_ExtractSubvector, cast(Args[0]->getType()), + None, CostKind, Index, cast(RetTy)); } case Intrinsic::vector_insert: { // FIXME: Handle case where a scalable vector is inserted into a scalable @@ -1526,18 +1527,18 @@ unsigned Index = cast(Args[2])->getZExtValue(); return thisT()->getShuffleCost( TTI::SK_InsertSubvector, cast(Args[0]->getType()), None, - Index, cast(Args[1]->getType())); + CostKind, Index, cast(Args[1]->getType())); } case Intrinsic::experimental_vector_reverse: { return thisT()->getShuffleCost(TTI::SK_Reverse, cast(Args[0]->getType()), None, - 0, cast(RetTy)); + CostKind, 0, cast(RetTy)); } case Intrinsic::experimental_vector_splice: { unsigned Index = cast(Args[2])->getZExtValue(); return thisT()->getShuffleCost(TTI::SK_Splice, cast(Args[0]->getType()), None, - Index, cast(RetTy)); + CostKind, Index, cast(RetTy)); } case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_mul: @@ -2215,7 +2216,7 @@ NumVecElts /= 2; VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, - NumVecElts, SubTy); + CostKind, NumVecElts, SubTy); ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); Ty = SubTy; ++LongVectorCount; @@ -2229,8 +2230,9 @@ // architecture-dependent length. // By default reductions need one shuffle per reduction level. - ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( - TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); + ShuffleCost += + NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, + None, CostKind, 0, Ty); ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); return ShuffleCost + ArithCost + @@ -2311,8 +2313,8 @@ auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); - ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, - NumVecElts, SubTy); + ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, + None, CostKind, NumVecElts, SubTy); MinMaxCost += thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, CmpInst::BAD_ICMP_PREDICATE, CostKind) + @@ -2328,8 +2330,9 @@ // operations performed on the current platform. That's why several final // reduction opertions are perfomed on the vectors with the same // architecture-dependent length. - ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( - TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); + ShuffleCost += + NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, + None, CostKind, 0, Ty); MinMaxCost += NumReduxLevels * (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -781,10 +781,11 @@ } InstructionCost TargetTransformInfo::getShuffleCost( - ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, int Index, - VectorType *SubTp, ArrayRef Args) const { + ShuffleKind Kind, VectorType *Ty, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, + ArrayRef Args) const { InstructionCost Cost = - TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp, Args); + TTIImpl->getShuffleCost(Kind, Ty, Mask, CostKind, Index, SubTp, Args); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -369,7 +369,8 @@ TTI::TargetCostKind CostKind); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2818,8 +2818,9 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, - VectorType *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, + int Index, VectorType *SubTp, ArrayRef Args) { std::pair LT = getTypeLegalizationCost(Tp); // If we have a Mask, and the LT is being legalized somehow, split the Mask @@ -2877,7 +2878,7 @@ if (NumSources <= 2) Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc : TTI::SK_PermuteTwoSrc, - NTp, NMask, 0, nullptr, Args); + NTp, NMask, CostKind, 0, nullptr, Args); else if (any_of(enumerate(NMask), [&](const auto &ME) { return ME.value() % LTNumElts == ME.index(); })) @@ -3027,7 +3028,7 @@ } } - return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); } bool AArch64TTIImpl::preferPredicateOverEpilogue( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -199,7 +199,8 @@ InstructionCost getVectorSplitCost() { return 0; } InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1054,6 +1054,7 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args) { Kind = improveShuffleKindFromMask(Kind, Mask); @@ -1074,7 +1075,7 @@ } } - return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp); + return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp); } bool GCNTTIImpl::areInlineCompatible(const Function *Caller, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -212,7 +212,8 @@ int getNumMemOps(const IntrinsicInst *I) const; InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1201,6 +1201,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args) { Kind = improveShuffleKindFromMask(Kind, Mask); @@ -1301,7 +1302,8 @@ int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) : 1; - return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); + return BaseCost * + BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); } InstructionCost ARMTTIImpl::getArithmeticInstrCost( diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -125,7 +125,9 @@ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, Type *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, + Type *SubTp, ArrayRef Args = None); InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -223,8 +223,9 @@ } InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, - Type *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, + int Index, Type *SubTp, ArrayRef Args) { return 1; } diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -112,7 +112,9 @@ ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, Type *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, + Type *SubTp, ArrayRef Args = None); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -1003,8 +1003,9 @@ } InstructionCost PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - ArrayRef Mask, int Index, - Type *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, + int Index, Type *SubTp, ArrayRef Args) { InstructionCost CostFactor = diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -101,7 +101,8 @@ InstructionCost getSpliceCost(VectorType *Tp, int Index); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -180,6 +180,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args) { if (isa(Tp)) { @@ -212,7 +213,7 @@ } } - return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); } InstructionCost diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -92,7 +92,8 @@ ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None); unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -594,8 +594,9 @@ InstructionCost SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, - VectorType *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, + int Index, VectorType *SubTp, ArrayRef Args) { Kind = improveShuffleKindFromMask(Kind, Mask); if (ST->hasVector()) { @@ -630,7 +631,7 @@ } } - return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp); } // Return the log2 difference of the element sizes of the two vector types. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -135,7 +135,8 @@ ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - ArrayRef Mask, int Index, + ArrayRef Mask, + TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef Args = None); InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1085,8 +1085,9 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, - ArrayRef Mask, int Index, - VectorType *SubTp, + ArrayRef Mask, + TTI::TargetCostKind CostKind, + int Index, VectorType *SubTp, ArrayRef Args) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. @@ -1134,8 +1135,9 @@ auto *SubTy = FixedVectorType::get(BaseTp->getElementType(), SubLT.second.getVectorNumElements()); int ExtractIndex = alignDown((Index % NumElts), NumSubElts); - InstructionCost ExtractCost = getShuffleCost( - TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy); + InstructionCost ExtractCost = + getShuffleCost(TTI::SK_ExtractSubvector, VecTy, None, CostKind, + ExtractIndex, SubTy); // If the original size is 32-bits or more, we can use pshufd. Otherwise // if we have SSSE3 we can use pshufb. @@ -1249,7 +1251,7 @@ InstructionCost Cost = 0; processShuffleMasks( NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, - [this, SingleOpTy, &PrevSrcReg, &PrevRegMask, + [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, &Cost](ArrayRef RegMask, unsigned SrcReg, unsigned DestReg) { if (!ShuffleVectorInst::isIdentityMask(RegMask)) { // Check if the previous register can be just copied to the next @@ -1257,7 +1259,7 @@ if (PrevRegMask.empty() || PrevSrcReg != SrcReg || PrevRegMask != RegMask) Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, - RegMask, 0, nullptr); + RegMask, CostKind, 0, nullptr); else // Just a copy of previous destination register. Cost += TTI::TCC_Basic; @@ -1271,21 +1273,21 @@ PrevSrcReg = SrcReg; PrevRegMask = RegMask; }, - [this, SingleOpTy, &Cost](ArrayRef RegMask, - unsigned /*Unused*/, - unsigned /*Unused*/) { + [this, SingleOpTy, CostKind, &Cost](ArrayRef RegMask, + unsigned /*Unused*/, + unsigned /*Unused*/) { Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask, - 0, nullptr); + CostKind, 0, nullptr); }); return Cost; } InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, - None, 0, nullptr); + None, CostKind, 0, nullptr); } - return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); + return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); } // For 2-input shuffles, we must account for splitting the 2 inputs into many. @@ -1648,7 +1650,7 @@ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); + return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp); } InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, @@ -3678,6 +3680,7 @@ assert(Val->isVectorTy() && "This must be a vector type"); Type *ScalarType = Val->getScalarType(); InstructionCost RegisterFileMoveCost = 0; + TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput; // Non-immediate extraction/insertion can be handled as a sequence of // aliased loads+stores via the stack. @@ -3693,19 +3696,16 @@ // Extract - store vector to stack, load scalar. if (Opcode == Instruction::ExtractElement) { - return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, - TTI::TargetCostKind::TCK_RecipThroughput) + + return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0, - TTI::TargetCostKind::TCK_RecipThroughput); + CostKind); } // Insert - store vector to stack, store scalar, load vector. if (Opcode == Instruction::InsertElement) { - return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, - TTI::TargetCostKind::TCK_RecipThroughput) + + return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) + getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0, - TTI::TargetCostKind::TCK_RecipThroughput) + - getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, - TTI::TargetCostKind::TCK_RecipThroughput); + CostKind) + + getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind); } } @@ -3783,8 +3783,8 @@ EVT VT = TLI->getValueType(DL, Val); if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) SubTy = FixedVectorType::get(ScalarType, SubNumElts); - ShuffleCost = - getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy); + ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, CostKind, + 0, SubTy); } int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; @@ -3809,7 +3809,7 @@ std::pair LT = getTypeLegalizationCost(Ty); MVT MScalarTy = LT.second.getScalarType(); unsigned SizeInBits = LT.second.getSizeInBits(); - + TTI::TargetCostKind CostKind = TTI::TargetCostKind::TCK_RecipThroughput; InstructionCost Cost = 0; // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much @@ -3916,7 +3916,7 @@ auto *Ty128 = FixedVectorType::get(Ty->getElementType(), Scale); for (unsigned I = 0; I != Num128Lanes; ++I) if (DemandedUpper128Lanes[I]) - Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, + Cost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, CostKind, I * Scale, Ty128); // Add all the demanded element extractions together, but adjust the @@ -4042,8 +4042,8 @@ unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); InstructionCost SingleShuffleCost = - getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, - /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr); + getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/None, + CostKind, /*Index=*/0, /*SubTp=*/nullptr); return NumDstVectorsDemanded * SingleShuffleCost; } @@ -4172,7 +4172,7 @@ if (!Is0thSubVec) Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector : TTI::ShuffleKind::SK_ExtractSubvector, - VTy, None, NumEltDone(), CurrVecTy); + VTy, None, CostKind, NumEltDone(), CurrVecTy); } // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM, @@ -4251,14 +4251,17 @@ if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires extend/truncate for data and a shuffle for mask. - Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) + - getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr); + Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, CostKind, 0, + nullptr) + + getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, CostKind, 0, + nullptr); else if (LT.first * LT.second.getVectorNumElements() > NumElem) { auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), LT.second.getVectorNumElements()); // Expanding requires fill mask with zeroes - Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy); + Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, CostKind, + 0, MaskTy); } // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. @@ -4503,8 +4506,8 @@ // If we're reducing from 256/512 bits, use an extract_subvector. if (Size > 128) { auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); - ReductionCost += - getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); + ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, + CostKind, NumVecElts, SubTy); Ty = SubTy; } else if (Size == 128) { // Reducing from 128 bits is a permute of v2f64/v2i64. @@ -4515,8 +4518,8 @@ else ShufTy = FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); - ReductionCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); + ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, + None, CostKind, 0, nullptr); } else if (Size == 64) { // Reducing from 64 bits is a shuffle of v4f32/v4i32. FixedVectorType *ShufTy; @@ -4526,8 +4529,8 @@ else ShufTy = FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); - ReductionCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); + ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, + None, CostKind, 0, nullptr); } else { // Reducing from smaller size is a shift by immediate. auto *ShiftTy = FixedVectorType::get( @@ -4805,8 +4808,8 @@ // If we're reducing from 256/512 bits, use an extract_subvector. if (Size > 128) { auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); - MinMaxCost += - getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); + MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, CostKind, + NumVecElts, SubTy); Ty = SubTy; } else if (Size == 128) { // Reducing from 128 bits is a permute of v2f64/v2i64. @@ -4816,8 +4819,8 @@ FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); else ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); - MinMaxCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); + MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, + CostKind, 0, nullptr); } else if (Size == 64) { // Reducing from 64 bits is a shuffle of v4f32/v4i32. FixedVectorType *ShufTy; @@ -4825,8 +4828,8 @@ ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); else ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); - MinMaxCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr); + MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, + CostKind, 0, nullptr); } else { // Reducing from smaller size is a shift by immediate. auto *ShiftTy = FixedVectorType::get( @@ -5622,7 +5625,7 @@ (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; InstructionCost ShuffleCost = - getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr); + getShuffleCost(ShuffleKind, SingleMemOpTy, None, CostKind, 0, nullptr); unsigned NumOfLoadsInInterleaveGrp = Indices.size() ? Indices.size() : Factor; @@ -5678,8 +5681,8 @@ // There is no strided stores meanwhile. And store can't be folded in // shuffle. unsigned NumOfSources = Factor; // The number of values to be merged. - InstructionCost ShuffleCost = - getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr); + InstructionCost ShuffleCost = getShuffleCost( + TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, CostKind, 0, nullptr); unsigned NumOfShufflesPerStore = NumOfSources - 1; // The SK_MergeTwoSrc shuffle clobbers one of src operands. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6405,8 +6405,8 @@ bool Reverse = ConsecutiveStride < 0; if (Reverse) - Cost += - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, + CostKind, 0); return Cost; } @@ -6463,6 +6463,7 @@ Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); unsigned AS = getLoadStoreAddressSpace(I); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto Group = getInterleavedAccessGroup(I); assert(Group && "Fail to get an interleaved access group."); @@ -6482,15 +6483,15 @@ (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. assert(!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."); - Cost += - Group->getNumMembers() * - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0); + Cost += Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, + CostKind, 0); } return Cost; } @@ -7036,9 +7037,10 @@ // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) - return TTI.getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, cast(VectorTy), - None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); + return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + cast(VectorTy), None, CostKind, + VF.getKnownMinValue() - 1, + FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5895,7 +5895,7 @@ if (Idx + NumElts <= EENumElts) { Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, None, Idx, VecTy); + EEVTy, None, CostKind, Idx, VecTy); } else { // Need to round up the subvector type vectorization factor to avoid a // crash in cost model functions. Make SubVT so that Idx + VF of SubVT @@ -5904,11 +5904,11 @@ FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, - EEVTy, None, Idx, SubVT); + EEVTy, None, CostKind, Idx, SubVT); } } else { Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, - VecTy, None, 0, EEVTy); + VecTy, None, CostKind, 0, EEVTy); } } }; @@ -5976,7 +5976,7 @@ assert(VecTy == FinalVecTy && "No reused scalars expected for broadcast."); return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, - /*Mask=*/None, /*Index=*/0, + /*Mask=*/None, CostKind, /*Index=*/0, /*SubTp=*/nullptr, /*Args=*/VL[0]); } InstructionCost ReuseShuffleCost = 0; @@ -6065,7 +6065,7 @@ // Add the cost for the subvectors insert. for (int I = VF, E = VL.size(); I < E; I += VF) GatherCost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, - None, I, LoadTy); + None, CostKind, I, LoadTy); } return ReuseShuffleCost + GatherCost - ScalarsCost; } @@ -6249,8 +6249,9 @@ if (InsertVecSz != VecSz) { auto *ActualVecTy = FixedVectorType::get(SrcVecTy->getElementType(), VecSz); - Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, - None, OffsetBeg - Offset, InsertVecTy); + Cost += + TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, None, + CostKind, OffsetBeg - Offset, InsertVecTy); } else { for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) Mask[I] = I;