Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1039,12 +1039,13 @@ const Instruction *CxtI = nullptr) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. + /// The exact mask may be passed as Mask, or else the array will be empty. /// The index and subtype parameters are used by the subvector insertion and /// extraction shuffle kinds to show the insert/extract point and the type of /// the subvector being inserted/extracted. /// NOTE: For subvector extractions Tp represents the source type. - int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index = 0, - VectorType *SubTp = nullptr) const; + int getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, + int Index = 0, VectorType *SubTp = nullptr) const; /// Represents a hint about the context in which a cast is used.
/// @@ -1555,7 +1556,8 @@ OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, OperandValueProperties Opd2PropInfo, ArrayRef Args, const Instruction *CxtI = nullptr) = 0; - virtual int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index, + virtual int getShuffleCost(ShuffleKind Kind, VectorType *Tp, + ArrayRef Mask, int Index, VectorType *SubTp) = 0; virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, @@ -2013,9 +2015,9 @@ return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo, Args, CxtI); } - int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp) override { - return Impl.getShuffleCost(Kind, Tp, Index, SubTp); + int getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + int Index, VectorType *SubTp) override { + return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp); } int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, CastContextHint CCH, TTI::TargetCostKind CostKind, Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -451,7 +451,8 @@ return 1; } - unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, int Index, + unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, + ArrayRef Mask, int Index, VectorType *SubTp) const { return 1; } @@ -1043,25 +1044,30 @@ int SubIndex; if (Shuffle->isExtractSubvectorMask(SubIndex)) return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy, - SubIndex, VecTy); + Shuffle->getShuffleMask(), SubIndex, + VecTy); else if (Shuffle->changesLength()) return CostKind == TTI::TCK_RecipThroughput ? 
-1 : 1; else if (Shuffle->isIdentity()) return 0; else if (Shuffle->isReverse()) - return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, 0, nullptr); + return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, + Shuffle->getShuffleMask(), 0, nullptr); else if (Shuffle->isSelect()) - return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, 0, nullptr); + return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, + Shuffle->getShuffleMask(), 0, nullptr); else if (Shuffle->isTranspose()) - return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, 0, nullptr); + return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, + Shuffle->getShuffleMask(), 0, nullptr); else if (Shuffle->isZeroEltSplat()) - return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, 0, nullptr); + return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, + Shuffle->getShuffleMask(), 0, nullptr); else if (Shuffle->isSingleSource()) - return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, 0, - nullptr); + return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, + Shuffle->getShuffleMask(), 0, nullptr); - return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, 0, - nullptr); + return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, + Shuffle->getShuffleMask(), 0, nullptr); } case Instruction::ExtractElement: { unsigned Idx = -1; Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -715,8 +715,8 @@ return OpCost; } - unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp) { + unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, + ArrayRef Mask, int Index, VectorType *SubTp) { switch (Kind) { case TTI::SK_Broadcast: @@ -1255,9 +1255,9 @@ if (isa(RetTy)) return BaseT::getIntrinsicInstrCost(ICA, CostKind); unsigned Index = cast(Args[1])->getZExtValue(); - return 
thisT()->getShuffleCost(TTI::SK_ExtractSubvector, - cast(Args[0]->getType()), - Index, cast(RetTy)); + return thisT()->getShuffleCost( + TTI::SK_ExtractSubvector, cast(Args[0]->getType()), + ArrayRef(), Index, cast(RetTy)); } case Intrinsic::experimental_vector_insert: { // FIXME: Handle case where a scalable vector is inserted into a scalable @@ -1266,13 +1266,13 @@ return BaseT::getIntrinsicInstrCost(ICA, CostKind); unsigned Index = cast(Args[2])->getZExtValue(); return thisT()->getShuffleCost( - TTI::SK_InsertSubvector, cast(Args[0]->getType()), Index, - cast(Args[1]->getType())); + TTI::SK_InsertSubvector, cast(Args[0]->getType()), + ArrayRef(), Index, cast(Args[1]->getType())); } case Intrinsic::experimental_vector_reverse: { - return thisT()->getShuffleCost(TTI::SK_Reverse, - cast(Args[0]->getType()), 0, - cast(RetTy)); + return thisT()->getShuffleCost( + TTI::SK_Reverse, cast(Args[0]->getType()), + ArrayRef(), 0, cast(RetTy)); } case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_mul: @@ -1907,9 +1907,9 @@ NumVecElts /= 2; VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); // Assume the pairwise shuffles add a cost. 
- ShuffleCost += - (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector, - Ty, NumVecElts, SubTy); + ShuffleCost += (IsPairwise + 1) * thisT()->getShuffleCost( + TTI::SK_ExtractSubvector, Ty, + ArrayRef(), NumVecElts, SubTy); ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); Ty = SubTy; ++LongVectorCount; @@ -1928,8 +1928,9 @@ unsigned NumShuffles = NumReduxLevels; if (IsPairwise && NumReduxLevels >= 1) NumShuffles += NumReduxLevels - 1; - ShuffleCost += NumShuffles * - thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); + ShuffleCost += + NumShuffles * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, + ArrayRef(), 0, Ty); ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty); return ShuffleCost + ArithCost + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); @@ -1965,9 +1966,9 @@ CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); // Assume the pairwise shuffles add a cost. - ShuffleCost += - (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector, - Ty, NumVecElts, SubTy); + ShuffleCost += (IsPairwise + 1) * thisT()->getShuffleCost( + TTI::SK_ExtractSubvector, Ty, + ArrayRef(), NumVecElts, SubTy); MinMaxCost += thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, CmpInst::BAD_ICMP_PREDICATE, CostKind) + @@ -1990,8 +1991,9 @@ unsigned NumShuffles = NumReduxLevels; if (IsPairwise && NumReduxLevels >= 1) NumShuffles += NumReduxLevels - 1; - ShuffleCost += NumShuffles * - thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); + ShuffleCost += + NumShuffles * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, + ArrayRef(), 0, Ty); MinMaxCost += NumReduxLevels * (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -706,8 +706,9 @@ } int 
TargetTransformInfo::getShuffleCost(ShuffleKind Kind, VectorType *Ty, - int Index, VectorType *SubTp) const { - int Cost = TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp); + ArrayRef<int> Mask, int Index, + VectorType *SubTp) const { + int Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -275,8 +275,8 @@ bool IsPairwiseForm, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); - int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, + int Index, VectorType *SubTp); /// @} }; Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1217,7 +1217,8 @@ } int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - int Index, VectorType *SubTp) { + ArrayRef<int> Mask, int Index, + VectorType *SubTp) { if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || Kind == TTI::SK_Reverse) { @@ -1285,5 +1286,5 @@ return LT.first * Entry->Cost; } - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -197,8 +197,8 @@ unsigned getVectorSplitCost() { return 0; } - unsigned
getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp); + unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, + ArrayRef<int> Mask, int Index, VectorType *SubTp); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1103,7 +1103,8 @@ } unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT, - int Index, VectorType *SubTp) { + ArrayRef<int> Mask, int Index, + VectorType *SubTp) { if (ST->hasVOP3PInsts()) { if (cast<FixedVectorType>(VT)->getNumElements() == 2 && DL.getTypeSizeInBits(VT->getElementType()) == 16) { @@ -1121,7 +1122,7 @@ } } - return BaseT::getShuffleCost(Kind, VT, Index, SubTp); + return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp); } bool GCNTTIImpl::areInlineCompatible(const Function *Caller, Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -183,8 +183,8 @@ int getNumMemOps(const IntrinsicInst *I) const; - int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, + int Index, VectorType *SubTp); bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1130,7 +1130,8 @@ } int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - int Index, VectorType *SubTp) { + ArrayRef<int> Mask, int Index, + VectorType
*SubTp) { if (ST->hasNEON()) { if (Kind == TTI::SK_Broadcast) { static const CostTblEntry NEONDupTbl[] = { @@ -1221,7 +1222,7 @@ int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) : 1; - return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h =================================================================== --- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -122,8 +122,8 @@ getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); + unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, + int Index, Type *SubTp); unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -207,7 +207,8 @@ } unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, - int Index, Type *SubTp) { + ArrayRef Mask, int Index, + Type *SubTp) { return 1; } Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -107,7 +107,8 @@ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); - 
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef Mask, + int Index, Type *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -984,8 +984,8 @@ return vectorCostAdjustment(Cost, Opcode, Ty, nullptr); } -int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, + ArrayRef Mask, int Index, Type *SubTp) { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -87,8 +87,8 @@ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); - int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + int Index, VectorType *SubTp); unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy); unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy); unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -548,7 +548,8 @@ } int 
SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, - int Index, VectorType *SubTp) { + ArrayRef Mask, int Index, + VectorType *SubTp) { if (ST->hasVector()) { unsigned NumVectors = getNumVectorRegs(Tp); @@ -581,7 +582,7 @@ } } - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp); } // Return the log2 difference of the element sizes of the two vector types. Index: llvm/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -127,8 +127,8 @@ TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr); - int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, - VectorType *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, + int Index, VectorType *SubTp); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -960,7 +960,8 @@ } int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, - int Index, VectorType *SubTp) { + ArrayRef Mask, int Index, + VectorType *SubTp) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are widened to type v4i32. 
std::pair LT = TLI->getTypeLegalizationCost(DL, BaseTp); @@ -1007,7 +1008,7 @@ SubLT.second.getVectorNumElements()); int ExtractIndex = alignDown((Index % NumElts), NumSubElts); int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy, - ExtractIndex, SubTy); + ArrayRef(), ExtractIndex, SubTy); // If the original size is 32-bits or more, we can use pshufd. Otherwise // if we have SSSE3 we can use pshufb. @@ -1080,11 +1081,11 @@ LegalVT.getVectorNumElements()); unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; - return NumOfShuffles * - getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); + return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, + ArrayRef(), 0, nullptr); } - return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp); + return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); } // For 2-input shuffles, we must account for splitting the 2 inputs into many. @@ -1392,7 +1393,7 @@ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp); + return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); } int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, @@ -3085,7 +3086,8 @@ EVT VT = TLI->getValueType(DL, Val); if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128) SubTy = FixedVectorType::get(ScalarType, SubNumElts); - ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy); + ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, + ArrayRef(), 0, SubTy); } int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1; return ShuffleCost + IntOrFpCost + RegisterFileMoveCost; @@ -3288,14 +3290,17 @@ if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. 
- Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) + - getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr); + Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, ArrayRef(), 0, + nullptr) + + getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, ArrayRef(), 0, + nullptr); else if (LT.second.getVectorNumElements() > NumElem) { auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(), LT.second.getVectorNumElements()); // Expanding requires fill mask with zeroes - Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy); + Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, ArrayRef(), + 0, MaskTy); } // Pre-AVX512 - each maskmov load costs 2 + store costs ~8. @@ -3528,8 +3533,8 @@ // If we're reducing from 256/512 bits, use an extract_subvector. if (Size > 128) { auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); - ReductionCost += - getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); + ReductionCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, + ArrayRef(), NumVecElts, SubTy); Ty = SubTy; } else if (Size == 128) { // Reducing from 128 bits is a permute of v2f64/v2i64. @@ -3540,8 +3545,8 @@ else ShufTy = FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2); - ReductionCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, + ArrayRef(), 0, nullptr); } else if (Size == 64) { // Reducing from 64 bits is a shuffle of v4f32/v4i32. FixedVectorType *ShufTy; @@ -3551,8 +3556,8 @@ else ShufTy = FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4); - ReductionCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + ReductionCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, + ArrayRef(), 0, nullptr); } else { // Reducing from smaller size is a shift by immediate. 
auto *ShiftTy = FixedVectorType::get( @@ -3832,8 +3837,8 @@ // If we're reducing from 256/512 bits, use an extract_subvector. if (Size > 128) { auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts); - MinMaxCost += - getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); + MinMaxCost += getShuffleCost(TTI::SK_ExtractSubvector, Ty, + ArrayRef(), NumVecElts, SubTy); Ty = SubTy; } else if (Size == 128) { // Reducing from 128 bits is a permute of v2f64/v2i64. @@ -3843,8 +3848,8 @@ FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); else ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); - MinMaxCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, + ArrayRef(), 0, nullptr); } else if (Size == 64) { // Reducing from 64 bits is a shuffle of v4f32/v4i32. FixedVectorType *ShufTy; @@ -3852,8 +3857,8 @@ ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4); else ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); - MinMaxCost += - getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + MinMaxCost += getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, + ArrayRef(), 0, nullptr); } else { // Reducing from smaller size is a shift by immediate. auto *ShiftTy = FixedVectorType::get( @@ -4666,7 +4671,7 @@ (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; unsigned ShuffleCost = - getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); + getShuffleCost(ShuffleKind, SingleMemOpTy, ArrayRef(), 0, nullptr); unsigned NumOfLoadsInInterleaveGrp = Indices.size() ? Indices.size() : Factor; @@ -4721,8 +4726,8 @@ // There is no strided stores meanwhile. And store can't be folded in // shuffle. unsigned NumOfSources = Factor; // The number of values to be merged. 
- unsigned ShuffleCost = - getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); + unsigned ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, + ArrayRef(), 0, nullptr); unsigned NumOfShufflesPerStore = NumOfSources - 1; // The SK_MergeTwoSrc shuffle clobbers one of src operands. Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6813,7 +6813,8 @@ bool Reverse = ConsecutiveStride < 0; if (Reverse) - Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, + ArrayRef(), 0); return Cost; } @@ -6831,7 +6832,8 @@ return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS, CostKind) + - TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy); + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, + ArrayRef()); } StoreInst *SI = cast(I); @@ -6898,7 +6900,8 @@ assert(!Legal->isMaskRequired(I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * - TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, + ArrayRef(), 0); } return Cost; } @@ -7309,9 +7312,10 @@ // First-order recurrences are replaced by vector shuffles inside the loop. // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type. 
if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi)) - return TTI.getShuffleCost( - TargetTransformInfo::SK_ExtractSubvector, cast(VectorTy), - VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1)); + return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + cast(VectorTy), ArrayRef(), + VF.getKnownMinValue() - 1, + FixedVectorType::get(RetTy, 1)); // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -287,10 +287,11 @@ /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 /// ret <4 x i8> %ins4 /// InstCombiner transforms this into a shuffle and vector mul +/// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// TargetTransformInfo::getInstructionThroughput? static Optional -isShuffle(ArrayRef VL) { +isShuffle(ArrayRef VL, SmallVector &Mask) { auto *EI0 = cast(VL[0]); unsigned Size = cast(EI0->getVectorOperandType())->getNumElements(); @@ -308,9 +309,12 @@ if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) + if (Idx->getValue().uge(Size)) { + Mask.push_back(-1); continue; + } unsigned IntIdx = Idx->getValue().getZExtValue(); + Mask.push_back(IntIdx); // We can extractelement from undef or poison vector. 
if (isa(Vec)) continue; @@ -3466,22 +3470,25 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) { - ReuseShuffleCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + ReuseShuffleCost = TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy, ArrayRef()); } if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { return ReuseShuffleCost + - TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, + ArrayRef(), 0); } if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { - Optional ShuffleKind = isShuffle(VL); + SmallVector Mask; + Optional ShuffleKind = + isShuffle(VL, Mask); if (ShuffleKind.hasValue()) { InstructionCost Cost = - TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); + TTI->getShuffleCost(ShuffleKind.getValue(), VecTy, Mask); for (auto *V : VL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this @@ -3546,7 +3553,7 @@ CommonCost = ReuseShuffleCost; } else if (!E->ReorderIndices.empty()) { CommonCost = TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + TargetTransformInfo::SK_PermuteSingleSrc, VecTy, ArrayRef()); } for (unsigned I = 0, E = VL.size(); I < E; ++I) { Instruction *EI = cast(VL[I]); @@ -3772,7 +3779,7 @@ } if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) VecLdCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + TargetTransformInfo::SK_PermuteSingleSrc, VecTy, ArrayRef()); LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); return ReuseShuffleCost + VecLdCost - ScalarLdCost; } @@ -3789,7 +3796,7 @@ Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); if (IsReorder) VecStCost += TTI->getShuffleCost( - 
TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + TargetTransformInfo::SK_PermuteSingleSrc, VecTy, ArrayRef()); LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); return VecStCost - ScalarStCost; } @@ -3856,7 +3863,8 @@ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, TTI::CastContextHint::None, CostKind); } - VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); + VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, + ArrayRef(), 0); LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -4150,7 +4158,8 @@ TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, /*Extract*/ false); if (!ShuffledIndices.empty()) - Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty, + ArrayRef()); return Cost; } @@ -7450,10 +7459,11 @@ BasicBlock *BB, BoUpSLP &R) { SmallVector BuildVectorInsts; SmallVector BuildVectorOpds; + SmallVector Mask; if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa(V); }) && - isShuffle(BuildVectorOpds))) + isShuffle(BuildVectorOpds, Mask))) return false; // Vectorize starting with the build vector operands ignoring the BuildVector Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -193,7 +193,8 @@ TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS); // Optionally, we are shuffling the loaded vector element(s) into place. 
if (OffsetEltIndex) - NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy); + NewCost += + TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, ArrayRef()); // We can aggressively convert to the vector form because the backend can // invert this transform if it does not result in a performance win. @@ -352,8 +353,8 @@ // ShufMask = { undef, undef, 0, undef } // TODO: The cost model has an option for a "broadcast" shuffle // (splat-from-element-0), but no option for a more general splat. - NewCost += - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + VecTy, ArrayRef()); } // Aggressively form a vector op if the cost is equal because the transform @@ -516,15 +517,6 @@ if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy) return false; - // The new shuffle must not cost more than the old shuffle. The bitcast is - // moved ahead of the shuffle, so assume that it has the same cost as before. - InstructionCost DestCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy); - InstructionCost SrcCost = - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy); - if (DestCost > SrcCost || !DestCost.isValid()) - return false; - unsigned DestNumElts = DestTy->getNumElements(); unsigned SrcNumElts = SrcTy->getNumElements(); SmallVector NewMask; @@ -542,6 +534,16 @@ if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask)) return false; } + + // The new shuffle must not cost more than the old shuffle. The bitcast is + // moved ahead of the shuffle, so assume that it has the same cost as before. 
+ InstructionCost DestCost = TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask); + InstructionCost SrcCost = + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask); + if (DestCost > SrcCost || !DestCost.isValid()) + return false; + // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' ++NumShufOfBitcast; Value *CastV = Builder.CreateBitCast(V, DestTy); @@ -725,8 +727,8 @@ int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; auto *CmpTy = cast(CmpInst::makeCmpResultType(X->getType())); InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType()); - NewCost += - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy); + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy, + ArrayRef()); NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy); NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);