Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -90,6 +90,16 @@
   return false;
 }
 
+/// Returns true when no element of \p Mask is positive, i.e. every lane is
+/// either index 0 or negative. Negative entries are presumably undef lanes,
+/// which isReverseVectorMask below also skips -- confirm with getShuffleMask.
+static bool isBroadcastVectorMask(SmallVectorImpl<int> &Mask) {
+  for (unsigned i = 0, MaskSize = Mask.size(); i < MaskSize; ++i)
+    if (Mask[i] > 0)
+      return false;
+  return true;
+}
+
 static bool isReverseVectorMask(SmallVectorImpl<int> &Mask) {
   for (unsigned i = 0, MaskSize = Mask.size(); i < MaskSize; ++i)
     if (Mask[i] >= 0 && Mask[i] != (int)(MaskSize - 1 - i))
@@ -495,6 +505,9 @@
     SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
 
     if (NumVecElems == Mask.size()) {
+      if (isBroadcastVectorMask(Mask))
+        return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast,
+                                   VecTypOp0, 0, nullptr);
       if (isReverseVectorMask(Mask))
         return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0,
                                    0, nullptr);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -598,9 +598,93 @@
 
 int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                                Type *SubTp) {
-  // We only estimate the cost of reverse and alternate shuffles.
-  if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate)
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+  if (Kind == TTI::SK_Broadcast) {
+    // Broadcast is special: after legalizing to multiple registers we just
+    // reuse the first broadcast register, so cost is not scaled by LT.first.
+    std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+
+    static const CostTblEntry AVX512BWShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v32i16, 1 }, // vpbroadcastw
+      { ISD::VECTOR_SHUFFLE, MVT::v64i8,  1 }  // vpbroadcastb
+    };
+
+    if (ST->hasBWI())
+      if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl,
+                                              ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX512ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8f64,  1 }, // vbroadcastsd
+      { ISD::VECTOR_SHUFFLE, MVT::v16f32, 1 }, // vbroadcastss
+      { ISD::VECTOR_SHUFFLE, MVT::v8i64,  1 }, // vpbroadcastq
+      { ISD::VECTOR_SHUFFLE, MVT::v16i32, 1 }  // vpbroadcastd
+    };
+
+    if (ST->hasAVX512())
+      if (const auto *Entry =
+              CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  1 }, // vbroadcastsd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  1 }, // vbroadcastss
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  1 }, // vpbroadcastq
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  1 }, // vpbroadcastd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 1 }, // vpbroadcastw
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  1 }  // vpbroadcastb
+    };
+
+    if (ST->hasAVX2())
+      if (const auto *Entry =
+              CostTableLookup(AVX2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry AVX1ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8f32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v4i64,  2 }, // vperm2f128 + vpermilpd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i32,  2 }, // vperm2f128 + vpermilps
+      { ISD::VECTOR_SHUFFLE, MVT::v16i16, 3 }, // pshuflw + pshufd + vinsertf128
+      { ISD::VECTOR_SHUFFLE, MVT::v32i8,  2 }  // pshufb + vinsertf128
+    };
+
+    if (ST->hasAVX())
+      if (const auto *Entry =
+              CostTableLookup(AVX1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSSE3ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 1 }, // pshufb
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 1 }  // pshufb
+    };
+
+    if (ST->hasSSSE3())
+      if (const auto *Entry =
+              CostTableLookup(SSSE3ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSE2ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 }, // shufpd
+      { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v4i32, 1 }, // pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 }, // pshuflw + pshufd
+      { ISD::VECTOR_SHUFFLE, MVT::v16i8, 3 }  // unpck + pshuflw + pshufd
+    };
+
+    if (ST->hasSSE2())
+      if (const auto *Entry =
+              CostTableLookup(SSE2ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+
+    static const CostTblEntry SSE1ShuffleTbl[] = {
+      { ISD::VECTOR_SHUFFLE, MVT::v4f32, 1 }, // shufps
+    };
+
+    if (ST->hasSSE1())
+      if (const auto *Entry =
+              CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
+        return Entry->Cost;
+  }
 
   if (Kind == TTI::SK_Reverse) {
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
@@ -792,7 +876,6 @@
       if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl,
                                               ISD::VECTOR_SHUFFLE, LT.second))
         return LT.first * Entry->Cost;
-    return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
   }
 
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
Index: test/Analysis/CostModel/X86/shuffle-broadcast.ll
===================================================================
--- test/Analysis/CostModel/X86/shuffle-broadcast.ll
+++ test/Analysis/CostModel/X86/shuffle-broadcast.ll
@@ -9,23 +9,161 @@
 ;
 ; Verify the cost model for broadcast shuffles.
 ;
+; Broadcast is special in that after legalization we can reuse the first broadcasted register.
+;
 
 ; CHECK-LABEL: 'test_vXf64'
 define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
-  ; SSE: Unknown cost {{.*}} %V128 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V128 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V128 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
   %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer
 
-  ; SSE: Unknown cost {{.*}} %V256 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V256 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V256 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer
 
-  ; SSE: Unknown cost {{.*}} %V512 = shufflevector
-  ; AVX: Unknown cost {{.*}} %V512 = shufflevector
-  ; AVX512: Unknown cost {{.*}} %V512 = shufflevector
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer
 
   ret void
 }
+
+; CHECK-LABEL: 'test_vXi64'
+define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXf32'
+define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi32'
+define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
+  ; SSE: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V64 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V64 = shufflevector
+  %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer
+
+  ; SSE: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi16'
+define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
+  ; SSE2: cost of 2 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 2 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 3 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer
+
+  ret void
+}
+
+; CHECK-LABEL: 'test_vXi8'
+define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
+  ; SSE2: cost of 3 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
+  %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
+  %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  ; SSE2: cost of 3 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 2 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
+  %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer
+
+  ret void
+}