diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -40,6 +40,7 @@ #include "X86TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" @@ -3810,61 +3811,102 @@ auto *SrcVecTy = FixedVectorType::get(EltTy, VF); auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); - int NumDstElements = VF * ReplicationFactor; - auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); - auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); + // What is our naive guess at how many elements the replication will produce? + // Some vectors may end up being duplicate or not demanded, + // so we may end up producing less vectors (and thus, elements) than that. + int NaiveNumDstElements = VF * ReplicationFactor; + auto *PromNaiveDstVecTy = + FixedVectorType::get(PromEltTy, NaiveNumDstElements); // Legalize the types. MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second; - MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second; - MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second; + MVT LegalPromNaiveDstVecTy = + TLI->getTypeLegalizationCost(DL, PromNaiveDstVecTy).second; // They should have legalized into vector types. if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || - !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) + !LegalPromNaiveDstVecTy.isVector()) return bailout(); + auto *SinglePromNaiveDstVecTy = + FixedVectorType::get(PromNaiveDstVecTy->getElementType(), + LegalPromNaiveDstVecTy.getVectorNumElements()); + + ArrayRef PrevSubMask; + ArrayRef CurrSubMask; + + auto IsNewUniqueSubMask = [&PrevSubMask, &CurrSubMask]() { + assert(!CurrSubMask.empty() && "The current submask is never empty."); + // Is this the first submask we are looking at? + if (PrevSubMask.empty()) + return true; + assert(CurrSubMask.size() <= PrevSubMask.size() && + "The mask size will at most decrease (once, for the final mask)."); + return CurrSubMask != PrevSubMask.take_front(CurrSubMask.size()); + }; + + const auto ReplicatedMask = createReplicatedMask(ReplicationFactor, VF); + ArrayRef RemainingMask = ReplicatedMask; + + auto ShouldCurrSubMaskBeDemanded = [DemandedDstElts, &RemainingMask, + &CurrSubMask, + ReplicatedMaskSize = + ReplicatedMask.size()]() { + size_t ProcessedMaskSize = ReplicatedMaskSize - RemainingMask.size(); + return !DemandedDstElts.extractBits(CurrSubMask.size(), ProcessedMaskSize) + .isZero(); + }; + + unsigned NumDemandedUniqueSubMasks = 0; + bool IsCurrUniqueSubMaskDemanded = false; + int NumDstElements = 0; + + while (!RemainingMask.empty()) { + size_t CurrMaskSize = std::min( + SinglePromNaiveDstVecTy->getNumElements(), RemainingMask.size()); + assert(CurrMaskSize != 0 && "The current submask is never empty."); + CurrSubMask = RemainingMask.take_front(CurrMaskSize); + + if (IsNewUniqueSubMask()) { + IsCurrUniqueSubMaskDemanded = false; + PrevSubMask = CurrSubMask; + } + + if (!IsCurrUniqueSubMaskDemanded) { + IsCurrUniqueSubMaskDemanded = ShouldCurrSubMaskBeDemanded(); + if (IsCurrUniqueSubMaskDemanded) { + ++NumDemandedUniqueSubMasks; + NumDstElements += CurrMaskSize; + } + } + + RemainingMask = RemainingMask.drop_front(CurrSubMask.size()); + } + + auto *PromRealDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); + auto *RealDstVecTy = FixedVectorType::get(EltTy, NumDstElements); + + InstructionCost CastCost; + if (PromEltTyBits != EltTyBits) { // If we have to perform the shuffle with wider elt type than our data type, // then we will first need to anyext (we don't care about the new bits) // the source elements, and then truncate Dst elements. - InstructionCost PromotionCost; - PromotionCost += getCastInstrCost( + CastCost += getCastInstrCost( Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, TargetTransformInfo::CastContextHint::None, CostKind); - PromotionCost += - getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, - /*Src=*/PromDstVecTy, + CastCost += + getCastInstrCost(Instruction::Trunc, /*Dst=*/RealDstVecTy, + /*Src=*/PromRealDstVecTy, TargetTransformInfo::CastContextHint::None, CostKind); - return PromotionCost + getReplicationShuffleCost(PromEltTy, - ReplicationFactor, VF, - DemandedDstElts, CostKind); } - assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && - LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && - "We expect that the legalization doesn't affect the element width, " - "doesn't coalesce/split elements."); - - unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); - unsigned NumDstVectors = - divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); - - auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); - - // Not all the produced Dst elements may be demanded. In our case, - // given that a single Dst vector is formed by a single shuffle, - // if all elements that will form a single Dst vector aren't demanded, - // then we won't need to do that shuffle, so adjust the cost accordingly. - APInt DemandedDstVectors = APIntOps::ScaleBitMask( - DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec), - NumDstVectors); - unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); - InstructionCost SingleShuffleCost = - getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, + getShuffleCost(TTI::SK_PermuteSingleSrc, SinglePromNaiveDstVecTy, /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr); - return NumDstVectorsDemanded * SingleShuffleCost; + InstructionCost ShuffleCost = NumDemandedUniqueSubMasks * SingleShuffleCost; + + return CastCost + ShuffleCost; } InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll @@ -380,7 +380,7 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512FVEC256-LABEL: 'replication_i64_stride7' -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <28 x i32> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <56 x i32> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <112 x i32> @@ -444,10 +444,10 @@ ; AVX512FVEC512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512FVEC256-LABEL: 'replication_i64_stride8' -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> -; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <128 x i32> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> +; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <128 x i32> ; AVX512FVEC256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32>