diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3622,12 +3622,15 @@ } InstructionCost X86TTIImpl::getReplicationShuffleCost( - Type *EltTy, int ReplicationFactor, int VF, + Type *SrcEltTy, int ReplicationFactor, int VF, const APInt &DemandedReplicatedElts, TTI::TargetCostKind CostKind) { - const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); + const unsigned SrcEltTyBits = DL.getTypeSizeInBits(SrcEltTy); + // We don't differentiate element types here, only element bit width. + SrcEltTy = IntegerType::getIntNTy(SrcEltTy->getContext(), SrcEltTyBits); + auto *SrcVecTy = FixedVectorType::get(SrcEltTy, VF); auto bailout = [&]() { - return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, + return BaseT::getReplicationShuffleCost(SrcEltTy, ReplicationFactor, VF, DemandedReplicatedElts, CostKind); }; @@ -3635,14 +3638,16 @@ if (!ST->hasAVX512()) return bailout(); - switch (EltTyBits) { + // Do we have a native shuffle for this element type, or should we promote? + unsigned ShufEltTyBits = SrcEltTyBits; + switch (SrcEltTyBits) { case 32: case 64: break; // AVX512F. case 16: if (!ST->hasBWI()) - return bailout(); - break; + ShufEltTyBits = 32; // promote to i32, AVX512F. + break; // AVX512BW case 8: if (!ST->hasVBMI()) return bailout(); @@ -3650,45 +3655,94 @@ default: return bailout(); } + const bool PerformPromotion = ShufEltTyBits != SrcEltTyBits; + + Type *ShufEltTy; + if (!PerformPromotion) + ShufEltTy = SrcEltTy; + else { + assert(ShufEltTyBits > SrcEltTyBits && "Should have widened!"); + ShufEltTy = IntegerType::getIntNTy(SrcEltTy->getContext(), ShufEltTyBits); + } + assert(DL.getTypeSizeInBits(ShufEltTy) == ShufEltTyBits && + "ShufEltTy does not have requested (ShufEltTyBits) bit width?"); + + // Before shuffling, we'll have to first extend the input vector to this type. 
+ auto *ShufVecTy = FixedVectorType::get(ShufEltTy, VF); - auto *SrcVecTy = FixedVectorType::get(EltTy, VF); int NumReplicatedElements = VF * ReplicationFactor; - auto *ReplicatedVecTy = FixedVectorType::get(EltTy, NumReplicatedElements); + // After replication, we'll get this vector ... + auto *ShufReplicatedVecTy = + FixedVectorType::get(ShufEltTy, NumReplicatedElements); + // ... which we will then have to truncate to this type. + auto *TgtReplicatedVecTy = + FixedVectorType::get(SrcEltTy, NumReplicatedElements); + + InstructionCost PromotionCost; + if (PerformPromotion) { + // If we have to perform the shuffle with wider elt type than our data type, + // then we will first need to anyext the source elements (we don't care + // about the new bits; costed as SExt), then truncate replicated elements. + PromotionCost += + getCastInstrCost(Instruction::SExt, /*Dst=*/ShufVecTy, /*Src=*/SrcVecTy, + TargetTransformInfo::CastContextHint::None, CostKind); + PromotionCost += + getCastInstrCost(Instruction::Trunc, /*Dst=*/TgtReplicatedVecTy, + /*Src=*/ShufReplicatedVecTy, + TargetTransformInfo::CastContextHint::None, CostKind); + } // Legalize the types. MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; - MVT LegalReplicatedVecTy = - TLI->getTypeLegalizationCost(DL, ReplicatedVecTy).second; + MVT LegalShufVecTy = TLI->getTypeLegalizationCost(DL, ShufVecTy).second; + MVT LegalShufReplicatedVecTy = + TLI->getTypeLegalizationCost(DL, ShufReplicatedVecTy).second; + MVT LegalTgtReplicatedVecTy = + TLI->getTypeLegalizationCost(DL, TgtReplicatedVecTy).second; - // They both should have legalized into vector types. 
+ if (!LegalSrcVecTy.isVector() || !LegalShufVecTy.isVector() || + !LegalShufReplicatedVecTy.isVector() || + !LegalTgtReplicatedVecTy.isVector()) return bailout(); - assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && - LegalSrcVecTy.getScalarType() == - LegalReplicatedVecTy.getScalarType() && + assert(LegalSrcVecTy.getScalarSizeInBits() == SrcEltTyBits && + LegalShufVecTy.getScalarSizeInBits() == ShufEltTyBits && + LegalShufReplicatedVecTy.getScalarSizeInBits() == ShufEltTyBits && + LegalTgtReplicatedVecTy.getScalarSizeInBits() == SrcEltTyBits && + LegalShufVecTy.getScalarType() == + LegalShufReplicatedVecTy.getScalarType() && + LegalTgtReplicatedVecTy.getScalarType() == + LegalSrcVecTy.getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements."); - unsigned NumEltsPerReplicatedVec = - LegalReplicatedVecTy.getVectorNumElements(); - unsigned NumReplicatedVectors = - divideCeil(ReplicatedVecTy->getNumElements(), NumEltsPerReplicatedVec); + // How wide of a vector will the replication shuffles produce? + unsigned NumEltsPerShufReplicatedVec = + LegalShufReplicatedVecTy.getVectorNumElements(); + auto *SingleShufReplicatedVecTy = + FixedVectorType::get(ShufEltTy, NumEltsPerShufReplicatedVec); - auto *SingleReplicatedVecTy = - FixedVectorType::get(EltTy, NumEltsPerReplicatedVec); + // And how many replicated vectors (and thus shuffles) will there be? + unsigned NumShufReplicatedVectors = divideCeil( + ShufReplicatedVecTy->getNumElements(), NumEltsPerShufReplicatedVec); - APInt DemandedReplicatedVectors = APIntOps::ScaleBitMask( - DemandedReplicatedElts.zextOrSelf(NumReplicatedVectors * - NumEltsPerReplicatedVec), - NumReplicatedVectors); - unsigned NumReplicatedVectorsDemanded = - DemandedReplicatedVectors.countPopulation(); + // Not all the produced replicated elements may be demanded. 
In our case, + // given that a single replicated vector is formed by a single shuffle, + // if all elements that will form a single replicated vector aren't demanded, + // then we won't need to do that shuffle, so adjust the cost accordingly. + APInt DemandedShufReplicatedVectors = APIntOps::ScaleBitMask( + DemandedReplicatedElts.zextOrSelf(NumShufReplicatedVectors * + NumEltsPerShufReplicatedVec), + NumShufReplicatedVectors); + unsigned NumShufReplicatedVectorsDemanded = + DemandedShufReplicatedVectors.countPopulation(); - InstructionCost SingleShuffleCost = - getShuffleCost(TTI::SK_PermuteSingleSrc, SingleReplicatedVecTy, + InstructionCost SingleShufShuffleCost = + getShuffleCost(TTI::SK_PermuteSingleSrc, SingleShufReplicatedVecTy, /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr); - return NumReplicatedVectorsDemanded * SingleShuffleCost; + return PromotionCost + + NumShufReplicatedVectorsDemanded * SingleShufShuffleCost; } InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll @@ -74,12 +74,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride2' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 60 for instruction: 
%vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride2' @@ -165,12 +165,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride3' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <3 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 
x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 285 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 570 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride3' @@ -256,12 +256,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride4' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 44 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride4' @@ -347,12 +347,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride5' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <5 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %vf4 = 
shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 236 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 472 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 113 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 232 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 473 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 946 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride5' @@ -438,12 +438,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride6' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <6 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> 
poison, <12 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 283 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 567 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1134 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride6' @@ -529,12 +529,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride7' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <7 x i32> zeroinitializer -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 19 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 308 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 616 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 326 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 661 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride7' @@ -620,12 +620,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf1 
= shufflevector <1 x i16> undef, <1 x i16> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 344 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 688 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride8'