diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -145,6 +145,14 @@ InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); + InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, + int VF, + const APInt &DemandedSrcElts, + const APInt &DemandedReplicatedElts, + TTI::TargetCostKind CostKind); + InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, + int VF, ArrayRef Mask, + TTI::TargetCostKind CostKind); InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3621,6 +3621,132 @@ return Cost; } +InstructionCost X86TTIImpl::getReplicationShuffleCost( + Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedSrcElts, + const APInt &DemandedReplicatedElts, TTI::TargetCostKind CostKind) { + const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); + + auto bailout = [&]() { + return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, + DemandedSrcElts, + DemandedReplicatedElts, CostKind); + }; + + // For now, only implement for 32-/64- bit-sized elts with AVX512F. + if (!ST->hasAVX512() || (EltTyBits != 32 && EltTyBits != 64)) + return bailout(); + + auto *SrcVecTy = FixedVectorType::get(EltTy, VF); + int NumReplicatedElements = VF * ReplicationFactor; + auto *ReplicatedVecTy = FixedVectorType::get(EltTy, NumReplicatedElements); + + // Legalize the types. + std::pair SrcVecTyLegalization = + TLI->getTypeLegalizationCost(DL, SrcVecTy); + std::pair ReplicatedVecTyLegalization = + TLI->getTypeLegalizationCost(DL, ReplicatedVecTy); + + MVT LegalSrcVecTy = SrcVecTyLegalization.second; + MVT LegalReplicatedVecTy = ReplicatedVecTyLegalization.second; + + // They both should have legalized into vector types. + if (!LegalSrcVecTy.isVector() || !LegalReplicatedVecTy.isVector()) + return bailout(); + + assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && + LegalSrcVecTy.getScalarType() == + LegalReplicatedVecTy.getScalarType() && + "We expect that the legalization doesn't affect the element width, " + "doesn't coalesce/split elements."); + + unsigned NumEltsPerReplicatedVec = + LegalReplicatedVecTy.getVectorNumElements(); + unsigned NumReplicatedVectors = + divideCeil(ReplicatedVecTy->getNumElements(), NumEltsPerReplicatedVec); + + auto *SingleReplicatedVecTy = + FixedVectorType::get(EltTy, NumEltsPerReplicatedVec); + + APInt DemandedReplicatedVectors = APIntOps::ScaleBitMask( + DemandedReplicatedElts.zextOrSelf(NumReplicatedVectors * + NumEltsPerReplicatedVec), + NumReplicatedVectors); + unsigned NumReplicatedVectorsDemanded = + DemandedReplicatedVectors.countPopulation(); + +#ifndef NDEBUG + unsigned NumSrcEltsPerVec = LegalSrcVecTy.getVectorNumElements(); + unsigned NumSrcVectors = + divideCeil(SrcVecTy->getNumElements(), NumSrcEltsPerVec); + + auto SrcEltIdxToSrcVecIdx = [NumSrcEltsPerVec, NumSrcVectors](int SrcEltIdx) { + unsigned SrcVecIdx = SrcEltIdx / NumSrcEltsPerVec; // truncating! + assert(SrcVecIdx < NumSrcVectors && "Out of source vectors?"); + return SrcVecIdx; + }; + + unsigned CurrReplicatedVecIdx = 0; + unsigned NumEltsInCurrVec = 0; + Optional DemandedSrcVecForCurrVec; + unsigned NumReplicatedVectorsProduced = 0; + + auto ReplicateElement = [&](int SrcEltIdx, int Indice, int EltIdx) { + // Into which vector are we going to replicate this element? + if (NumEltsInCurrVec == NumEltsPerReplicatedVec) { + ++CurrReplicatedVecIdx; + NumEltsInCurrVec = 0; + DemandedSrcVecForCurrVec.reset(); + } + + // Said vector now has one more occupied element. + ++NumEltsInCurrVec; + + // Do we actually demand this particular element to be replicated? + if (!DemandedSrcElts[SrcEltIdx] || !DemandedReplicatedElts[EltIdx]) + return; + + // Ok, we actually have to replicate the element. From which src vector? + int SrcVecIdx = SrcEltIdxToSrcVecIdx(SrcEltIdx); + // Demand this source vector for this replicated vector. + assert( + (!DemandedSrcVecForCurrVec || DemandedSrcVecForCurrVec == SrcVecIdx) && + "Expecting single replicated vector to require elements from a single " + "source vector."); + if (!DemandedSrcVecForCurrVec) { + DemandedSrcVecForCurrVec = SrcVecIdx; + ++NumReplicatedVectorsProduced; + } + }; + + int NumEltsProduced = 0; + for (int SrcEltIdx : seq(0, VF)) { + for (int Indice : seq(0, ReplicationFactor)) { + ReplicateElement(SrcEltIdx, Indice, NumEltsProduced); + ++NumEltsProduced; + } + } + assert(NumEltsProduced == NumReplicatedElements && + "Replication did not produce the expected number of elements."); + assert(NumReplicatedVectorsProduced <= NumReplicatedVectors && + "Replication did not produce the expected number of elements."); + assert(NumReplicatedVectorsProduced == NumReplicatedVectorsDemanded && + "FINAL CHECK: two algorithms should produce the same answer."); +#endif + + InstructionCost SingleShuffleCost = + getShuffleCost(TTI::SK_PermuteSingleSrc, SingleReplicatedVecTy, + /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr); + return NumReplicatedVectorsDemanded * SingleShuffleCost; +} + +InstructionCost +X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, + int VF, ArrayRef Mask, + TTI::TargetCostKind CostKind) { + return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, Mask, + CostKind); +} + InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll @@ -59,11 +59,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride2' -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <4 x i32> @@ -124,11 +124,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride3' -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <6 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <12 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <48 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <96 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <6 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <96 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <6 x i32> @@ -189,11 +189,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride4' -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <64 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <128 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <128 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <8 x i32> @@ -254,11 +254,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride5' -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <10 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <20 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <40 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <80 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <160 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <10 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <20 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <40 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <80 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <160 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <10 x i32> @@ -319,11 +319,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride6' -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <12 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <48 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <96 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <192 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <96 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <192 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <12 x i32> @@ -384,11 +384,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride7' -; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <14 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <28 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <56 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <112 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <224 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <14 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <28 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <56 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <112 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <224 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <14 x i32> @@ -449,11 +449,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <64 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <128 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <256 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <128 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <256 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <16 x i32> diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll @@ -53,10 +53,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride2' -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <32 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> @@ -110,10 +110,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride3' -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <6 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <12 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <6 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <48 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <6 x i32> @@ -167,10 +167,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride4' -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <8 x i32> @@ -224,10 +224,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride5' -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <10 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <20 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <40 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <80 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <10 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <20 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <40 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <80 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <10 x i32> @@ -281,10 +281,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride6' -; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <12 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <48 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <96 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <96 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <12 x i32> @@ -338,10 +338,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride7' -; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <28 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <56 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <112 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <28 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <56 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <112 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> @@ -395,10 +395,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <128 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <128 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32>