diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3624,28 +3624,33 @@ return Cost; } -InstructionCost X86TTIImpl::getReplicationShuffleCost( - Type *EltTy, int ReplicationFactor, int VF, - const APInt &DemandedReplicatedElts, TTI::TargetCostKind CostKind) { +InstructionCost +X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, + int VF, const APInt &DemandedDstElts, + TTI::TargetCostKind CostKind) { const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); + // We don't differentiate element types here, only element bit width. + EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); auto bailout = [&]() { return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, - DemandedReplicatedElts, CostKind); + DemandedDstElts, CostKind); }; // For now, only deal with AVX512 cases. if (!ST->hasAVX512()) return bailout(); + // Do we have a native shuffle for this element type, or should we promote? + unsigned EffEltTyBits = EltTyBits; switch (EltTyBits) { case 32: case 64: break; // AVX512F. case 16: if (!ST->hasBWI()) - return bailout(); - break; + EffEltTyBits = 32; // promote to i32, AVX512F. + break; // AVX512BW case 8: if (!ST->hasVBMI()) return bailout(); @@ -3653,45 +3658,66 @@ default: return bailout(); } + auto *EffEltTy = IntegerType::getIntNTy(EltTy->getContext(), EffEltTyBits); auto *SrcVecTy = FixedVectorType::get(EltTy, VF); - int NumReplicatedElements = VF * ReplicationFactor; - auto *ReplicatedVecTy = FixedVectorType::get(EltTy, NumReplicatedElements); + auto *EffSrcVecTy = FixedVectorType::get(EffEltTy, VF); + + int NumDstElements = VF * ReplicationFactor; + auto *EffDstVecTy = FixedVectorType::get(EffEltTy, NumDstElements); + auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); // Legalize the types. MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; - MVT LegalReplicatedVecTy = - TLI->getTypeLegalizationCost(DL, ReplicatedVecTy).second; - - // They both should have legalized into vector types. - if (!LegalSrcVecTy.isVector() || !LegalReplicatedVecTy.isVector()) + MVT LegalEffSrcVecTy = TLI->getTypeLegalizationCost(DL, EffSrcVecTy).second; + MVT LegalEffDstVecTy = TLI->getTypeLegalizationCost(DL, EffDstVecTy).second; + MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second; + // They should have legalized into vector types. + if (!LegalSrcVecTy.isVector() || !LegalEffSrcVecTy.isVector() || + !LegalEffDstVecTy.isVector() || !LegalDstVecTy.isVector()) return bailout(); + if (EffEltTyBits != EltTyBits) { + // If we have to perform the shuffle with wider elt type than our data type, + // then we will first need to anyext (we don't care about the new bits) + // the source elements, and then truncate Dst elements. + InstructionCost PromotionCost; + PromotionCost += getCastInstrCost( + Instruction::SExt, /*Dst=*/EffSrcVecTy, /*Src=*/SrcVecTy, + TargetTransformInfo::CastContextHint::None, CostKind); + PromotionCost += + getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, + /*Src=*/EffDstVecTy, + TargetTransformInfo::CastContextHint::None, CostKind); + return PromotionCost + getReplicationShuffleCost(EffEltTy, + ReplicationFactor, VF, + DemandedDstElts, CostKind); + } + assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && - LegalSrcVecTy.getScalarType() == - LegalReplicatedVecTy.getScalarType() && + LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && "We expect that the legalization doesn't affect the element width, " "doesn't coalesce/split elements."); - unsigned NumEltsPerReplicatedVec = - LegalReplicatedVecTy.getVectorNumElements(); - unsigned NumReplicatedVectors = - divideCeil(ReplicatedVecTy->getNumElements(), NumEltsPerReplicatedVec); + unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements(); + unsigned NumDstVectors = + divideCeil(DstVecTy->getNumElements(), NumEltsPerDstVec); - auto *SingleReplicatedVecTy = - FixedVectorType::get(EltTy, NumEltsPerReplicatedVec); + auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); - APInt DemandedReplicatedVectors = APIntOps::ScaleBitMask( - DemandedReplicatedElts.zextOrSelf(NumReplicatedVectors * - NumEltsPerReplicatedVec), - NumReplicatedVectors); - unsigned NumReplicatedVectorsDemanded = - DemandedReplicatedVectors.countPopulation(); + // Not all the produced Dst elements may be demanded. In our case, + // given that a single Dst vector is formed by a single shuffle, + // if all elements that will form a single Dst vector aren't demanded, + // then we won't need to do that shuffle, so adjust the cost accordingly. + APInt DemandedDstVectors = APIntOps::ScaleBitMask( + DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec), + NumDstVectors); + unsigned NumDstVectorsDemanded = DemandedDstVectors.countPopulation(); InstructionCost SingleShuffleCost = - getShuffleCost(TTI::SK_PermuteSingleSrc, SingleReplicatedVecTy, + getShuffleCost(TTI::SK_PermuteSingleSrc, SingleDstVecTy, /*Mask=*/None, /*Index=*/0, /*SubTp=*/nullptr); - return NumReplicatedVectorsDemanded * SingleShuffleCost; + return NumDstVectorsDemanded * SingleShuffleCost; } InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll @@ -74,12 +74,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride2' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride2' @@ -165,12 +165,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride3' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <3 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride3' @@ -256,12 +256,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride4' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride4' @@ -347,12 +347,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride5' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <5 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 236 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 472 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride5' @@ -438,12 +438,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride6' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <6 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride6' @@ -529,12 +529,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride7' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <7 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 308 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 616 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride7' @@ -620,12 +620,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 344 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 688 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride8'