diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -145,6 +145,14 @@
   InstructionCost getScalarizationOverhead(VectorType *Ty,
                                            const APInt &DemandedElts,
                                            bool Insert, bool Extract);
+  InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+                                            int VF,
+                                            const APInt &DemandedSrcElts,
+                                            const APInt &DemandedReplicatedElts,
+                                            TTI::TargetCostKind CostKind);
+  InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+                                            int VF, ArrayRef<int> Mask,
+                                            TTI::TargetCostKind CostKind);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                   MaybeAlign Alignment, unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3621,6 +3621,151 @@
   return Cost;
 }
 
+/// Estimate the cost of a replication shuffle: every one of the \p VF source
+/// elements of type \p EltTy is repeated \p ReplicationFactor times in a row.
+///
+/// Only handled here for 32-/64-bit elements on AVX512F subtargets, and only
+/// when both vector types legalize to vectors; anything else is deferred to
+/// the generic implementation. The model charges one SK_PermuteSingleSrc
+/// shuffle of a legal-width result vector for every result vector that holds
+/// at least one demanded element; fully undemanded result vectors are free.
+InstructionCost X86TTIImpl::getReplicationShuffleCost(
+    Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedSrcElts,
+    const APInt &DemandedReplicatedElts, TTI::TargetCostKind CostKind) {
+  const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy);
+
+  auto bailout = [&]() {
+    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
+                                            DemandedSrcElts,
+                                            DemandedReplicatedElts, CostKind);
+  };
+
+  // For now, only implement for 32-/64-bit-sized elts with AVX512F.
+  if (!ST->hasAVX512() || (EltTyBits != 32 && EltTyBits != 64))
+    return bailout();
+
+  auto *SrcVecTy = FixedVectorType::get(EltTy, VF);
+  int NumReplicatedElements = VF * ReplicationFactor;
+  auto *ReplicatedVecTy = FixedVectorType::get(EltTy, NumReplicatedElements);
+
+  // Legalize the types.
+  std::pair<InstructionCost, MVT> SrcVecTyLegalization =
+      TLI->getTypeLegalizationCost(DL, SrcVecTy);
+  std::pair<InstructionCost, MVT> ReplicatedVecTyLegalization =
+      TLI->getTypeLegalizationCost(DL, ReplicatedVecTy);
+
+  MVT LegalSrcVecTy = SrcVecTyLegalization.second;
+  MVT LegalReplicatedVecTy = ReplicatedVecTyLegalization.second;
+
+  // They both should have legalized into vector types.
+  if (!LegalSrcVecTy.isVector() || !LegalReplicatedVecTy.isVector())
+    return bailout();
+
+  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
+         LegalSrcVecTy.getScalarType() ==
+             LegalReplicatedVecTy.getScalarType() &&
+         "We expect that the legalization doesn't affect the element width, "
+         "doesn't coalesce/split elements.");
+
+  unsigned NumSrcEltsPerVec = LegalSrcVecTy.getVectorNumElements();
+  unsigned NumSrcVectors =
+      divideCeil(SrcVecTy->getNumElements(), NumSrcEltsPerVec);
+
+  unsigned NumEltsPerReplicatedVec =
+      LegalReplicatedVecTy.getVectorNumElements();
+  unsigned NumReplicatedVectors =
+      divideCeil(ReplicatedVecTy->getNumElements(), NumEltsPerReplicatedVec);
+
+  auto *SingleReplicatedVecTy =
+      FixedVectorType::get(EltTy, NumEltsPerReplicatedVec);
+
+  // Map a source element index to the legalized source vector that holds it.
+  auto SrcEltIdxToSrcVecIdx = [NumSrcEltsPerVec, NumSrcVectors](int SrcEltIdx) {
+    unsigned SrcVecIdx = SrcEltIdx / NumSrcEltsPerVec; // truncating!
+    assert(SrcVecIdx < NumSrcVectors && "Out of source vectors?");
+    // The capture is otherwise unused when asserts are compiled out.
+    (void)NumSrcVectors;
+    return SrcVecIdx;
+  };
+
+  // One entry per legalized result vector, recording which legalized source
+  // vector (if any) it has to be shuffled from.
+  SmallVector<Optional<int> /*DemandedSourceVector*/> ReplicatedVectors;
+  ReplicatedVectors.reserve(NumReplicatedVectors);
+  unsigned NumEltsInCurrVec = 0;
+
+  // Return the result vector currently being filled, opening a new one when
+  // the previous one is full.
+  auto GetVacantReplicatedVector =
+      [&ReplicatedVectors, &NumEltsInCurrVec,
+       NumEltsPerReplicatedVec]() -> Optional<int> & /*DemandedSourceVector*/ {
+    if (ReplicatedVectors.empty() ||
+        NumEltsInCurrVec == NumEltsPerReplicatedVec) {
+      ReplicatedVectors.emplace_back();
+      NumEltsInCurrVec = 0;
+    }
+    return ReplicatedVectors.back();
+  };
+
+  // Place result element EltIdx (a copy of source element SrcEltIdx).
+  auto ReplicateElement = [&GetVacantReplicatedVector, &NumEltsInCurrVec,
+                           &SrcEltIdxToSrcVecIdx, &DemandedSrcElts,
+                           &DemandedReplicatedElts](int SrcEltIdx, int EltIdx) {
+    // Into which vector are we going to replicate this element?
+    Optional<int> &DemandedSourceVector = GetVacantReplicatedVector();
+    // Said vector now has one more occupied element.
+    ++NumEltsInCurrVec;
+
+    // Do we actually demand this particular element to be replicated?
+    if (!DemandedSrcElts[SrcEltIdx] || !DemandedReplicatedElts[EltIdx])
+      return;
+
+    // Ok, we actually have to replicate the element. From which src vector?
+    int SrcVecIdx = SrcEltIdxToSrcVecIdx(SrcEltIdx);
+    // Demand this source vector for this replicated vector.
+    assert((!DemandedSourceVector || DemandedSourceVector == SrcVecIdx) &&
+           "Expecting to require elements from a single source vector.");
+    if (!DemandedSourceVector)
+      DemandedSourceVector = SrcVecIdx;
+  };
+
+  int NumEltsProduced = 0;
+  for (int SrcEltIdx : seq(0, VF)) {
+    // Emit ReplicationFactor consecutive copies of this source element.
+    for (int Copy = 0; Copy != ReplicationFactor; ++Copy) {
+      ReplicateElement(SrcEltIdx, NumEltsProduced);
+      ++NumEltsProduced;
+    }
+  }
+  assert(NumEltsProduced == NumReplicatedElements &&
+         "Replication did not produce the expected number of elements.");
+  assert(ReplicatedVectors.size() == NumReplicatedVectors &&
+         "Replication did not produce the expected number of vectors.");
+
+  // Okay, now we just have to count how many shuffles we need.
+  InstructionCost Cost = 0;
+  for (const Optional<int> &DemandedSourceVector : ReplicatedVectors) {
+    if (!DemandedSourceVector)
+      continue; // This replicated vector is undef. No shuffling needed.
+    InstructionCost ShuffleCost = getShuffleCost(
+        TTI::SK_PermuteSingleSrc, SingleReplicatedVecTy, /*Mask=*/None,
+        /*Index=*/0, /*SubTp=*/nullptr);
+    Cost += ShuffleCost;
+  }
+
+  return Cost;
+}
+
+/// Mask-based overload; it is forwarded unchanged to the generic
+/// implementation.
+InstructionCost
+X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
+                                      int VF, ArrayRef<int> Mask,
+                                      TTI::TargetCostKind CostKind) {
+  return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, Mask,
+                                          CostKind);
+}
+
 InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                             MaybeAlign Alignment,
                                             unsigned AddressSpace,
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll
--- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i32.ll
@@ -59,11 +59,11 @@
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512-LABEL: 'replication_i32_stride2'
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf2 = shufflevector <2 x i32> undef, 
<2 x i32> poison, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <4 x i32> @@ -124,11 +124,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride3' -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <6 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <12 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %vf8 = shufflevector <8 x i32> 
undef, <8 x i32> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <48 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <96 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <6 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <96 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <6 x i32> @@ -189,11 +189,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride4' -; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <64 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %vf32 = 
shufflevector <32 x i32> undef, <32 x i32> poison, <128 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <128 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <8 x i32> @@ -254,11 +254,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride5' -; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <10 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <20 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <40 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <80 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <160 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <10 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <20 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <40 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <80 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <160 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <10 x i32> @@ -319,11 +319,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride6' -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <12 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <48 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <96 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <192 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <96 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <192 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <12 x i32> @@ -384,11 +384,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride7' -; AVX512-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <14 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <28 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <56 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <112 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <224 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <14 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <28 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <56 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <112 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <224 x i32> ; AVX512-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <14 x i32> @@ -449,11 +449,11 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i32_stride8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <64 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <128 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <256 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i32> undef, <4 x i32> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i32> undef, <8 x i32> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i32> undef, <16 x i32> poison, <128 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf32 = shufflevector <32 x i32> undef, <32 x i32> poison, <256 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i32> undef, <2 x i32> poison, <16 x i32> diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll --- 
a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i64.ll @@ -53,10 +53,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride2' -; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <32 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <4 x i32> @@ -110,10 +110,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride3' -; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <6 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <12 x i32> -; AVX512-NEXT: 
Cost Model: Found an estimated cost of 50 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <6 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <48 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <6 x i32> @@ -167,10 +167,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride4' -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <8 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 124 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <8 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <16 x i32> +; 
AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <64 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <8 x i32> @@ -224,10 +224,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride5' -; AVX512-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <10 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <20 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <40 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <80 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <10 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <20 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <40 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <80 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <10 x i32> @@ -281,10 +281,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 
'replication_i64_stride6' -; AVX512-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <12 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <24 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <48 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <96 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <12 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <24 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <48 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <96 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <12 x i32> @@ -338,10 +338,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride7' -; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <28 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <56 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 196 for instruction: %vf16 = shufflevector <16 x i64> 
undef, <16 x i64> poison, <112 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <28 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <56 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <112 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <14 x i32> @@ -395,10 +395,10 @@ ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512-LABEL: 'replication_i64_stride8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 110 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> -; AVX512-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %vf16 = shufflevector <16 x i64> undef, <16 x i64> poison, <128 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i64> undef, <4 x i64> poison, <32 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i64> undef, <8 x i64> poison, <64 x i32> +; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %vf16 = shufflevector 
<16 x i64> undef, <16 x i64> poison, <128 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %vf2 = shufflevector <2 x i64> undef, <2 x i64> poison, <16 x i32>