diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3629,6 +3629,8 @@ int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) { const unsigned EltTyBits = DL.getTypeSizeInBits(EltTy); + // We don't differentiate element types here, only element bit width. + EltTy = IntegerType::getIntNTy(EltTy->getContext(), EltTyBits); auto bailout = [&]() { return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF, @@ -3639,14 +3641,16 @@ if (!ST->hasAVX512()) return bailout(); + // Do we have a native shuffle for this element type, or should we promote? + unsigned PromEltTyBits = EltTyBits; switch (EltTyBits) { case 32: case 64: break; // AVX512F. case 16: if (!ST->hasBWI()) - return bailout(); - break; + PromEltTyBits = 32; // promote to i32, AVX512F. + break; // AVX512BW case 8: if (!ST->hasVBMI()) return bailout(); @@ -3654,19 +3658,42 @@ default: return bailout(); } + auto *PromEltTy = IntegerType::getIntNTy(EltTy->getContext(), PromEltTyBits); auto *SrcVecTy = FixedVectorType::get(EltTy, VF); + auto *PromSrcVecTy = FixedVectorType::get(PromEltTy, VF); + int NumDstElements = VF * ReplicationFactor; + auto *PromDstVecTy = FixedVectorType::get(PromEltTy, NumDstElements); auto *DstVecTy = FixedVectorType::get(EltTy, NumDstElements); // Legalize the types. MVT LegalSrcVecTy = TLI->getTypeLegalizationCost(DL, SrcVecTy).second; + MVT LegalPromSrcVecTy = TLI->getTypeLegalizationCost(DL, PromSrcVecTy).second; + MVT LegalPromDstVecTy = TLI->getTypeLegalizationCost(DL, PromDstVecTy).second; MVT LegalDstVecTy = TLI->getTypeLegalizationCost(DL, DstVecTy).second; - - // They both should have legalized into vector types. - if (!LegalSrcVecTy.isVector() || !LegalDstVecTy.isVector()) + // They should have legalized into vector types. + if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() || + !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector()) return bailout(); + if (PromEltTyBits != EltTyBits) { + // If we have to perform the shuffle with wider elt type than our data type, + // then we will first need to anyext (we don't care about the new bits) + // the source elements, and then truncate Dst elements. + InstructionCost PromotionCost; + PromotionCost += getCastInstrCost( + Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy, + TargetTransformInfo::CastContextHint::None, CostKind); + PromotionCost += + getCastInstrCost(Instruction::Trunc, /*Dst=*/DstVecTy, + /*Src=*/PromDstVecTy, + TargetTransformInfo::CastContextHint::None, CostKind); + return PromotionCost + getReplicationShuffleCost(PromEltTy, + ReplicationFactor, VF, + DemandedDstElts, CostKind); + } + assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits && LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() && "We expect that the legalization doesn't affect the element width, " @@ -3678,6 +3705,10 @@ auto *SingleDstVecTy = FixedVectorType::get(EltTy, NumEltsPerDstVec); + // Not all the produced Dst elements may be demanded. In our case, + // given that a single Dst vector is formed by a single shuffle, + // if all elements that will form a single Dst vector aren't demanded, + // then we won't need to do that shuffle, so adjust the cost accordingly. APInt DemandedDstVectors = APIntOps::ScaleBitMask( DemandedDstElts.zextOrSelf(NumDstVectors * NumEltsPerDstVec), NumDstVectors); diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll --- a/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-replication-i16.ll @@ -74,12 +74,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride2' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <4 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <128 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride2' @@ -165,12 +165,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride3' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <3 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 164 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <6 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <12 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <24 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <48 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <96 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <192 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride3' @@ -256,12 +256,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride4' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 200 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <8 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <256 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride4' @@ -347,12 +347,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride5' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <5 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 114 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 236 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 472 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <10 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <20 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <40 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <80 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <160 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <320 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride5' @@ -438,12 +438,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride6' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <6 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <12 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <24 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <48 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <96 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <192 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <384 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride6' @@ -529,12 +529,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride7' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <7 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 308 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 616 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <14 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <28 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <56 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <112 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <224 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <448 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride7' @@ -620,12 +620,12 @@ ; ; AVX512F-LABEL: 'replication_i16_stride8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %vf1 = shufflevector <1 x i16> undef, <1 x i16> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 344 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> -; AVX512F-NEXT: Cost Model: Found an estimated cost of 688 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vf2 = shufflevector <2 x i16> undef, <2 x i16> poison, <16 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %vf4 = shufflevector <4 x i16> undef, <4 x i16> poison, <32 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %vf8 = shufflevector <8 x i16> undef, <8 x i16> poison, <64 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %vf16 = shufflevector <16 x i16> undef, <16 x i16> poison, <128 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %vf32 = shufflevector <32 x i16> undef, <32 x i16> poison, <256 x i32> +; AVX512F-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %vf64 = shufflevector <64 x i16> undef, <64 x i16> poison, <512 x i32> ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512BW-LABEL: 'replication_i16_stride8'