diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1039,11 +1039,13 @@
                              const Instruction *CxtI = nullptr) const;
 
   /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
+  /// The exact mask may be passed as Mask, or else the array will be empty.
   /// The index and subtype parameters are used by the subvector insertion and
   /// extraction shuffle kinds to show the insert/extract point and the type of
   /// the subvector being inserted/extracted.
   /// NOTE: For subvector extractions Tp represents the source type.
-  int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index = 0,
+  int getShuffleCost(ShuffleKind Kind, VectorType *Tp,
+                     ArrayRef<int> Mask = None, int Index = 0,
                      VectorType *SubTp = nullptr) const;
 
   /// Represents a hint about the context in which a cast is used.
@@ -1555,7 +1557,8 @@
       OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
       OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
       const Instruction *CxtI = nullptr) = 0;
-  virtual int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index,
+  virtual int getShuffleCost(ShuffleKind Kind, VectorType *Tp,
+                             ArrayRef<int> Mask, int Index,
                              VectorType *SubTp) = 0;
   virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                CastContextHint CCH,
@@ -2013,9 +2016,9 @@
     return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
                                        Opd1PropInfo, Opd2PropInfo, Args, CxtI);
   }
-  int getShuffleCost(ShuffleKind Kind, VectorType *Tp, int Index,
-                     VectorType *SubTp) override {
-    return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
+  int getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
+                     int Index, VectorType *SubTp) override {
+    return Impl.getShuffleCost(Kind, Tp, Mask, Index, SubTp);
   }
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        CastContextHint CCH, TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -451,7 +451,8 @@
     return 1;
   }
 
-  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, int Index,
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty,
+                          ArrayRef<int> Mask, int Index,
                           VectorType *SubTp) const {
     return 1;
   }
@@ -1043,25 +1044,30 @@
       int SubIndex;
       if (Shuffle->isExtractSubvectorMask(SubIndex))
         return TargetTTI->getShuffleCost(TTI::SK_ExtractSubvector, VecSrcTy,
-                                         SubIndex, VecTy);
+                                         Shuffle->getShuffleMask(), SubIndex,
+                                         VecTy);
       else if (Shuffle->changesLength())
        return CostKind == TTI::TCK_RecipThroughput ? -1 : 1;
      else if (Shuffle->isIdentity())
        return 0;
      else if (Shuffle->isReverse())
-        return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy, 0, nullptr);
+        return TargetTTI->getShuffleCost(TTI::SK_Reverse, VecTy,
+                                         Shuffle->getShuffleMask(), 0, nullptr);
      else if (Shuffle->isSelect())
-        return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy, 0, nullptr);
+        return TargetTTI->getShuffleCost(TTI::SK_Select, VecTy,
+                                         Shuffle->getShuffleMask(), 0, nullptr);
      else if (Shuffle->isTranspose())
-        return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy, 0, nullptr);
+        return TargetTTI->getShuffleCost(TTI::SK_Transpose, VecTy,
+                                         Shuffle->getShuffleMask(), 0, nullptr);
      else if (Shuffle->isZeroEltSplat())
-        return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy, 0, nullptr);
+        return TargetTTI->getShuffleCost(TTI::SK_Broadcast, VecTy,
+                                         Shuffle->getShuffleMask(), 0, nullptr);
      else if (Shuffle->isSingleSource())
-        return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, 0,
-                                         nullptr);
+        return TargetTTI->getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy,
+                                         Shuffle->getShuffleMask(), 0, nullptr);
 
-      return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, 0,
-                                       nullptr);
+      return TargetTTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy,
+                                       Shuffle->getShuffleMask(), 0, nullptr);
    }
    case Instruction::ExtractElement: {
      unsigned Idx = -1;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -715,8 +715,8 @@
     return OpCost;
   }
 
-  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
-                          VectorType *SubTp) {
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+                          ArrayRef<int> Mask, int Index, VectorType *SubTp) {
 
     switch (Kind) {
     case TTI::SK_Broadcast:
@@ -1256,7 +1256,7 @@
       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
     unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
     return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
-                                   cast<VectorType>(Args[0]->getType()),
+                                   cast<VectorType>(Args[0]->getType()), None,
                                    Index, cast<VectorType>(RetTy));
   }
   case Intrinsic::experimental_vector_insert: {
@@ -1266,13 +1266,13 @@
       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
     unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
     return thisT()->getShuffleCost(
-        TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), Index,
-        cast<VectorType>(Args[1]->getType()));
+        TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
+        Index, cast<VectorType>(Args[1]->getType()));
   }
   case Intrinsic::experimental_vector_reverse: {
     return thisT()->getShuffleCost(TTI::SK_Reverse,
-                                   cast<VectorType>(Args[0]->getType()), 0,
-                                   cast<VectorType>(RetTy));
+                                   cast<VectorType>(Args[0]->getType()), None,
+                                   0, cast<VectorType>(RetTy));
   }
   case Intrinsic::vector_reduce_add:
   case Intrinsic::vector_reduce_mul:
@@ -1907,9 +1907,9 @@
       NumVecElts /= 2;
       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
       // Assume the pairwise shuffles add a cost.
-      ShuffleCost +=
-          (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
-                                                     Ty, NumVecElts, SubTy);
+      ShuffleCost += (IsPairwise + 1) *
+                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
+                                             NumVecElts, SubTy);
       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
       Ty = SubTy;
       ++LongVectorCount;
@@ -1928,8 +1928,8 @@
     unsigned NumShuffles = NumReduxLevels;
     if (IsPairwise && NumReduxLevels >= 1)
       NumShuffles += NumReduxLevels - 1;
-    ShuffleCost += NumShuffles *
-                   thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty);
+    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
+                                     TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
     ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
     return ShuffleCost + ArithCost +
            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
@@ -1965,9 +1965,9 @@
       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
 
       // Assume the pairwise shuffles add a cost.
-      ShuffleCost +=
-          (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
-                                                     Ty, NumVecElts, SubTy);
+      ShuffleCost += (IsPairwise + 1) *
+                     thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
+                                             NumVecElts, SubTy);
       MinMaxCost +=
           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
@@ -1990,8 +1990,8 @@
     unsigned NumShuffles = NumReduxLevels;
     if (IsPairwise && NumReduxLevels >= 1)
       NumShuffles += NumReduxLevels - 1;
-    ShuffleCost += NumShuffles *
-                   thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty);
+    ShuffleCost += NumShuffles * thisT()->getShuffleCost(
+                                     TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
     MinMaxCost +=
         NumReduxLevels *
         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -706,8 +706,9 @@
 }
 
 int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, VectorType *Ty,
-                                        int Index, VectorType *SubTp) const {
-  int Cost = TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp);
+                                        ArrayRef<int> Mask, int Index,
+                                        VectorType *SubTp) const {
+  int Cost = TTIImpl->getShuffleCost(Kind, Ty, Mask, Index, SubTp);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -275,8 +275,8 @@
       bool IsPairwiseForm,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 
-  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
-                     VectorType *SubTp);
+  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
+                     int Index, VectorType *SubTp);
   /// @}
 };
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1217,7 +1217,8 @@
 }
 
 int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
-                                   int Index, VectorType *SubTp) {
+                                   ArrayRef<int> Mask, int Index,
+                                   VectorType *SubTp) {
   if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
       Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
       Kind == TTI::SK_Reverse) {
@@ -1289,5 +1290,5 @@
     return LT.first * Entry->Cost;
   }
 
-  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -197,8 +197,8 @@
 
   unsigned getVectorSplitCost() { return 0; }
 
-  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
-                          VectorType *SubTp);
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+                          ArrayRef<int> Mask, int Index, VectorType *SubTp);
 
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1103,7 +1103,8 @@
 }
 
 unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
-                                    int Index, VectorType *SubTp) {
+                                    ArrayRef<int> Mask, int Index,
+                                    VectorType *SubTp) {
   if (ST->hasVOP3PInsts()) {
     if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
         DL.getTypeSizeInBits(VT->getElementType()) == 16) {
@@ -1121,7 +1122,7 @@
     }
   }
 
-  return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, VT, Mask, Index, SubTp);
 }
 
 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -183,8 +183,8 @@
 
   int getNumMemOps(const IntrinsicInst *I) const;
 
-  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
-                     VectorType *SubTp);
+  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
+                     int Index, VectorType *SubTp);
 
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1131,7 +1131,8 @@
 }
 
 int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
-                               int Index, VectorType *SubTp) {
+                               ArrayRef<int> Mask, int Index,
+                               VectorType *SubTp) {
   if (ST->hasNEON()) {
     if (Kind == TTI::SK_Broadcast) {
       static const CostTblEntry NEONDupTbl[] = {
@@ -1222,7 +1223,7 @@
   int BaseCost = ST->hasMVEIntegerOps() && Tp->isVectorTy() ?
                      ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) : 1;
-  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  return BaseCost * BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
 }
 
 int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -122,8 +122,8 @@
       getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                             unsigned AddressSpace,
                             TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
-  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
-                          Type *SubTp);
+  unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef<int> Mask,
+                          int Index, Type *SubTp);
   unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                   const Value *Ptr, bool VariableMask,
                                   Align Alignment, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -207,7 +207,8 @@
 }
 
 unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
-                                        int Index, Type *SubTp) {
+                                        ArrayRef<int> Mask, int Index,
+                                        Type *SubTp) {
   return 1;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -107,7 +107,8 @@
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
-  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, ArrayRef<int> Mask,
+                     int Index, Type *SubTp);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                        const Instruction *I = nullptr);
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -984,8 +984,8 @@
   return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
 }
 
-int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
-                               Type *SubTp) {
+int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
+                               ArrayRef<int> Mask, int Index, Type *SubTp) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -87,8 +87,8 @@
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
-  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
-                     VectorType *SubTp);
+  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
+                     int Index, VectorType *SubTp);
   unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
   unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
   unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -548,7 +548,8 @@
 }
 
 int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
-                                   int Index, VectorType *SubTp) {
+                                   ArrayRef<int> Mask, int Index,
+                                   VectorType *SubTp) {
   if (ST->hasVector()) {
     unsigned NumVectors = getNumVectorRegs(Tp);
@@ -581,7 +582,7 @@
     }
   }
 
-  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
 }
 
 // Return the log2 difference of the element sizes of the two vector types.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -127,8 +127,8 @@
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
-  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
-                     VectorType *SubTp);
+  int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask,
+                     int Index, VectorType *SubTp);
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                        const Instruction *I = nullptr);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -960,7 +960,8 @@
 }
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
-                               int Index, VectorType *SubTp) {
+                               ArrayRef<int> Mask, int Index,
+                               VectorType *SubTp) {
   // 64-bit packed float vectors (v2f32) are widened to type v4f32.
   // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
@@ -1006,7 +1007,7 @@
       auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                          SubLT.second.getVectorNumElements());
       int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
-      int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
+      int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy, None,
                                        ExtractIndex, SubTy);
 
       // If the original size is 32-bits or more, we can use pshufd. Otherwise
@@ -1080,11 +1081,11 @@
                                             LegalVT.getVectorNumElements());
       unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
-      return NumOfShuffles *
-             getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
+      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
+                                            None, 0, nullptr);
     }
 
-    return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
   }
 
   // For 2-input shuffles, we must account for splitting the 2 inputs into many.
@@ -1392,7 +1393,7 @@
   if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
     return LT.first * Entry->Cost;
 
-  return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
+  return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
 }
 
 int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
@@ -3085,7 +3086,8 @@
       EVT VT = TLI->getValueType(DL, Val);
       if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
         SubTy = FixedVectorType::get(ScalarType, SubNumElts);
-      ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+      ShuffleCost =
+          getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, None, 0, SubTy);
     }
     int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
     return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
@@ -3288,14 +3290,14 @@
     if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
         LT.second.getVectorNumElements() == NumElem)
       // Promotion requires expand/truncate for data and a shuffle for mask.
-      Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, 0, nullptr) +
-              getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
+      Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, None, 0, nullptr) +
+              getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, None, 0, nullptr);
 
     else if (LT.second.getVectorNumElements() > NumElem) {
       auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
                                              LT.second.getVectorNumElements());
       // Expanding requires fill mask with zeroes
-      Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
+      Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, None, 0, MaskTy);
     }
 
     // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
@@ -3529,7 +3531,7 @@
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
       ReductionCost +=
-          getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+          getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
       // Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -3541,7 +3543,7 @@
         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
       ReductionCost +=
-          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
       FixedVectorType *ShufTy;
@@ -3552,7 +3554,7 @@
         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
       ReductionCost +=
-          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
       auto *ShiftTy = FixedVectorType::get(
@@ -3833,7 +3835,7 @@
     if (Size > 128) {
       auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
       MinMaxCost +=
-          getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+          getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy);
       Ty = SubTy;
     } else if (Size == 128) {
       // Reducing from 128 bits is a permute of v2f64/v2i64.
@@ -3844,7 +3846,7 @@
       else
         ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
       MinMaxCost +=
-          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
     } else if (Size == 64) {
       // Reducing from 64 bits is a shuffle of v4f32/v4i32.
       FixedVectorType *ShufTy;
@@ -3853,7 +3855,7 @@
       else
         ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
       MinMaxCost +=
-          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+          getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, None, 0, nullptr);
     } else {
       // Reducing from smaller size is a shift by immediate.
       auto *ShiftTy = FixedVectorType::get(
@@ -4666,7 +4668,7 @@
       (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;
 
   unsigned ShuffleCost =
-      getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr);
+      getShuffleCost(ShuffleKind, SingleMemOpTy, None, 0, nullptr);
 
   unsigned NumOfLoadsInInterleaveGrp =
       Indices.size() ? Indices.size() : Factor;
@@ -4722,7 +4724,7 @@
   // shuffle.
   unsigned NumOfSources = Factor; // The number of values to be merged.
   unsigned ShuffleCost =
-      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, None, 0, nullptr);
   unsigned NumOfShufflesPerStore = NumOfSources - 1;
 
   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6794,7 +6794,8 @@
 
   bool Reverse = ConsecutiveStride < 0;
   if (Reverse)
-    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+    Cost +=
+        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
   return Cost;
 }
@@ -6878,8 +6879,9 @@
     // TODO: Add support for reversed masked interleaved access.
     assert(!Legal->isMaskRequired(I) &&
            "Reverse masked interleaved access not supported.");
-    Cost += Group->getNumMembers() *
-            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+    Cost +=
+        Group->getNumMembers() *
+        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
   }
   return Cost;
 }
@@ -7292,7 +7294,7 @@
   if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
     return TTI.getShuffleCost(
         TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
-        VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
+        None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
 
   // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
   // converted into select instructions. We require N - 1 selects per phi
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -287,10 +287,11 @@
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
+/// Mask will return the shuffle mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
 static Optional<TargetTransformInfo::ShuffleKind>
-isShuffle(ArrayRef<Value *> VL) {
+isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
   auto *EI0 = cast<ExtractElementInst>(VL[0]);
   unsigned Size =
       cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
@@ -308,9 +309,12 @@
     if (!Idx)
       return None;
     // Undefined behavior if Idx is negative or >= Size.
-    if (Idx->getValue().uge(Size))
+    if (Idx->getValue().uge(Size)) {
+      Mask.push_back(UndefMaskElem);
       continue;
+    }
     unsigned IntIdx = Idx->getValue().getZExtValue();
+    Mask.push_back(IntIdx);
     // We can extractelement from undef or poison vector.
     if (isa<UndefValue>(Vec))
       continue;
@@ -3467,21 +3471,25 @@
     InstructionCost ReuseShuffleCost = 0;
     if (NeedToShuffleReuses) {
       ReuseShuffleCost =
-          TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+          TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
+                              E->ReuseShuffleIndices);
     }
     if (E->State == TreeEntry::NeedToGather) {
       if (allConstant(VL))
         return 0;
       if (isSplat(VL)) {
         return ReuseShuffleCost +
-               TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+               TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
+                                   None, 0);
       }
       if (E->getOpcode() == Instruction::ExtractElement &&
           allSameType(VL) && allSameBlock(VL)) {
-        Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+        SmallVector<int> Mask;
+        Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
+            isShuffle(VL, Mask);
        if (ShuffleKind.hasValue()) {
          InstructionCost Cost =
-              TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
+              TTI->getShuffleCost(ShuffleKind.getValue(), VecTy, Mask);
          for (auto *V : VL) {
            // If all users of instruction are going to be vectorized and this
            // instruction itself is not going to be vectorized, consider this
@@ -3545,8 +3553,10 @@
       }
       CommonCost = ReuseShuffleCost;
     } else if (!E->ReorderIndices.empty()) {
+      SmallVector<int> NewMask;
+      inversePermutation(E->ReorderIndices, NewMask);
       CommonCost = TTI->getShuffleCost(
-          TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+          TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
     }
     for (unsigned I = 0, E = VL.size(); I < E; ++I) {
       Instruction *EI = cast<Instruction>(VL[I]);
@@ -3770,9 +3780,12 @@
             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
             /*VariableMask=*/false, alignment, CostKind, VL0);
       }
-      if (!NeedToShuffleReuses && !E->ReorderIndices.empty())
+      if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) {
+        SmallVector<int> NewMask;
+        inversePermutation(E->ReorderIndices, NewMask);
         VecLdCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
+      }
       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
       return ReuseShuffleCost + VecLdCost - ScalarLdCost;
     }
@@ -3787,9 +3800,12 @@
       InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
       InstructionCost VecStCost = TTI->getMemoryOpCost(
           Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      if (IsReorder)
+      if (IsReorder) {
+        SmallVector<int> NewMask;
+        inversePermutation(E->ReorderIndices, NewMask);
        VecStCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, NewMask);
+      }
       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
       return VecStCost - ScalarStCost;
     }
@@ -3856,7 +3872,15 @@
         VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
                                          TTI::CastContextHint::None, CostKind);
       }
-      VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+
+      SmallVector<int> Mask(E->Scalars.size());
+      for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) {
+        auto *OpInst = cast<Instruction>(E->Scalars[I]);
+        assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+        Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0);
+      }
+      VecCost +=
+          TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
       LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
       return ReuseShuffleCost + VecCost - ScalarCost;
     }
@@ -7448,10 +7472,11 @@
                                                  BasicBlock *BB, BoUpSLP &R) {
   SmallVector<Value *, 16> BuildVectorInsts;
   SmallVector<Value *, 16> BuildVectorOpds;
+  SmallVector<int> Mask;
   if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
       (llvm::all_of(BuildVectorOpds,
                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
-       isShuffle(BuildVectorOpds)))
+       isShuffle(BuildVectorOpds, Mask)))
     return false;
 
   // Vectorize starting with the build vector operands ignoring the BuildVector
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -192,8 +192,18 @@
   InstructionCost NewCost =
       TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
   // Optionally, we are shuffling the loaded vector element(s) into place.
+  // For the mask set everything but element 0 to undef to prevent poison from
+  // propagating from the extra loaded memory. This will also optionally
+  // shrink/grow the vector from the loaded size to the output size.
+  // We assume this operation has no cost in codegen if there was no offset.
+  // Note that we could use freeze to avoid poison problems, but then we might
+  // still need a shuffle to change the vector size.
+  unsigned OutputNumElts = Ty->getNumElements();
+  SmallVector<int> Mask(OutputNumElts, UndefMaskElem);
+  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
+  Mask[0] = OffsetEltIndex;
   if (OffsetEltIndex)
-    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -205,17 +215,6 @@
   IRBuilder<> Builder(Load);
   Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
-
-  // Set everything but element 0 to undef to prevent poison from propagating
-  // from the extra loaded memory. This will also optionally shrink/grow the
-  // vector from the loaded size to the output size.
-  // We assume this operation has no cost in codegen if there was no offset.
-  // Note that we could use freeze to avoid poison problems, but then we might
-  // still need a shuffle to change the vector size.
-  unsigned OutputNumElts = Ty->getNumElements();
-  SmallVector<int> Mask(OutputNumElts, UndefMaskElem);
-  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
-  Mask[0] = OffsetEltIndex;
   VecLd = Builder.CreateShuffleVector(VecLd, Mask);
   replaceValue(I, *VecLd);
@@ -516,15 +515,6 @@
   if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
     return false;
 
-  // The new shuffle must not cost more than the old shuffle. The bitcast is
-  // moved ahead of the shuffle, so assume that it has the same cost as before.
-  InstructionCost DestCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy);
-  InstructionCost SrcCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy);
-  if (DestCost > SrcCost || !DestCost.isValid())
-    return false;
-
   unsigned DestNumElts = DestTy->getNumElements();
   unsigned SrcNumElts = SrcTy->getNumElements();
   SmallVector<int, 16> NewMask;
@@ -542,6 +532,16 @@
     if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
       return false;
   }
+
+  // The new shuffle must not cost more than the old shuffle. The bitcast is
+  // moved ahead of the shuffle, so assume that it has the same cost as before.
+  InstructionCost DestCost = TTI.getShuffleCost(
+      TargetTransformInfo::SK_PermuteSingleSrc, DestTy, NewMask);
+  InstructionCost SrcCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy, Mask);
+  if (DestCost > SrcCost || !DestCost.isValid())
+    return false;
+
   // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
   ++NumShufOfBitcast;
   Value *CastV = Builder.CreateBitCast(V, DestTy);
@@ -725,8 +725,10 @@
   int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
   auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
   InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
-  NewCost +=
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
+  SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
+  ShufMask[CheapIndex] = ExpensiveIndex;
+  NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
+                                ShufMask);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
   NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
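Illustrative sketch (not part of the patch): with the mask plumbed through, a backend override can price specific mask patterns that the coarse ShuffleKind classification cannot distinguish, while an empty mask preserves the old kind-only behavior. MyTTIImpl and the returned cost below are invented for illustration; TTI::ShuffleKind, ArrayRef<int>, ShuffleVectorInst::isReverseMask, ShuffleVectorInst::getShuffleMask, and the BaseT fallback are the existing LLVM APIs this patch touches.

// Hypothetical target override; MyTTIImpl and the unit cost are invented.
unsigned MyTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                   ArrayRef<int> Mask, int Index,
                                   VectorType *SubTp) {
  // An empty Mask keeps the pre-patch behavior: cost derived from Kind alone.
  if (!Mask.empty() && Kind == TTI::SK_PermuteSingleSrc &&
      ShuffleVectorInst::isReverseMask(Mask))
    return 1; // Assume this target reverses a vector in one instruction.
  return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}

// Caller side, mirroring the TargetTransformInfoImpl.h changes above: the
// exact mask of a shufflevector is forwarded instead of being dropped, and
// Index/SubTp keep their defaults.
int Cost = TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy,
                              Shuffle->getShuffleMask());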