Index: include/llvm/IR/Instructions.h =================================================================== --- include/llvm/IR/Instructions.h +++ include/llvm/IR/Instructions.h @@ -2237,6 +2237,12 @@ return Mask; } + /// Determine if the shuffle mask is a splat, possibly with undefined mask + /// indices as well. Returns true if the same shuffle index is found in all + /// defined elements and optionally returns the splat index. Returns false + /// if the mask is not a splat or all mask indices are undefined. + bool isSplat(int *SplatIndex = nullptr) const; + // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { return I->getOpcode() == Instruction::ShuffleVector; Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -516,6 +516,10 @@ SmallVector<int, 16> Mask = Shuffle->getShuffleMask(); if (NumVecElems == Mask.size()) { + int BroadcastIndex = -1; + if (Shuffle->isSplat(&BroadcastIndex) && BroadcastIndex == 0) + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, + VecTypOp0, 0, nullptr); if (isReverseVectorMask(Mask)) return TTI->getShuffleCost(TargetTransformInfo::SK_Reverse, VecTypOp0, 0, nullptr); Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -4885,18 +4885,6 @@ return true; } -static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { - SmallVector<int, 16> Mask(SVI->getShuffleMask()); - int SplatElem = -1; - for (unsigned i = 0; i < Mask.size(); ++i) { - if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem) - return false; - SplatElem = Mask[i]; - } - - return true; -} - /// Some targets have expensive vector shifts if the lanes aren't all the same /// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). 
In these cases /// it's often worth sinking a shufflevector splat down to its use so that @@ -4910,7 +4898,7 @@ // We only expect better codegen by sinking a shuffle if we can recognise a // constant splat. - if (!isBroadcastShuffle(SVI)) + if (!SVI->isSplat()) return false; // InsertedShuffles - Only insert a shuffle in each block once. Index: lib/IR/Instructions.cpp =================================================================== --- lib/IR/Instructions.cpp +++ lib/IR/Instructions.cpp @@ -1894,6 +1894,23 @@ } } +bool ShuffleVectorInst::isSplat(int *SplatIndex /* = nullptr */) const { + SmallVector<int, 16> Mask; + getShuffleMask(Mask); + + int SplatElem = -1; + for (int M : Mask) { + if (M < 0) + continue; + if (0 <= SplatElem && SplatElem != M) + return false; + SplatElem = M; + } + + if (0 <= SplatElem && SplatIndex) + *SplatIndex = SplatElem; + return (0 <= SplatElem); +} //===----------------------------------------------------------------------===// // InsertValueInst Class Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -598,11 +598,18 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate) { + if (Kind == TTI::SK_Reverse || Kind == TTI::SK_Alternate || + Kind == TTI::SK_Broadcast) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + // For Broadcasts we are splatting the first element from the first input + // register, so only need to reference that input and all the output + // registers are the same. 
+ if (Kind == TTI::SK_Broadcast) + LT.first = 1; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { { TTI::SK_Reverse, MVT::v64i8, 1 }, // vpermb { TTI::SK_Reverse, MVT::v32i8, 1 } // vpermb @@ -614,10 +621,13 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX512BWShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw - { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 - // + 2*pshufb + vinserti64x4 + { TTI::SK_Broadcast, MVT::v32i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v64i8, 1 }, // vpbroadcastb + + { TTI::SK_Reverse, MVT::v32i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v16i16, 1 }, // vpermw + { TTI::SK_Reverse, MVT::v64i8, 6 } // vextracti64x4 + 2*vperm2i128 + // + 2*pshufb + vinserti64x4 }; if (ST->hasBWI()) @@ -626,10 +636,15 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX512ShuffleTbl[] = { - { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd - { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps - { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq - { TTI::SK_Reverse, MVT::v16i32, 1 }, // vpermd + { TTI::SK_Broadcast, MVT::v8f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v16f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v8i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v16i32, 1 }, // vpbroadcastd + + { TTI::SK_Reverse, MVT::v8f64, 1 }, // vpermpd + { TTI::SK_Reverse, MVT::v16f32, 1 }, // vpermps + { TTI::SK_Reverse, MVT::v8i64, 1 }, // vpermq + { TTI::SK_Reverse, MVT::v16i32, 1 } // vpermd }; if (ST->hasAVX512()) @@ -638,6 +653,13 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 1 }, // vbroadcastpd + { TTI::SK_Broadcast, MVT::v8f32, 1 }, // vbroadcastps + { TTI::SK_Broadcast, MVT::v4i64, 1 }, // vpbroadcastq + { TTI::SK_Broadcast, MVT::v8i32, 1 }, // vpbroadcastd + { TTI::SK_Broadcast, MVT::v16i16, 1 }, // vpbroadcastw + { TTI::SK_Broadcast, MVT::v32i8, 1 }, // 
vpbroadcastb + { TTI::SK_Reverse, MVT::v4f64, 1 }, // vpermpd { TTI::SK_Reverse, MVT::v8f32, 1 }, // vpermps { TTI::SK_Reverse, MVT::v4i64, 1 }, // vpermq @@ -654,6 +676,13 @@ return LT.first * Entry->Cost; static const CostTblEntry AVX1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd + { TTI::SK_Broadcast, MVT::v8i32, 2 }, // vperm2f128 + vpermilps + { TTI::SK_Broadcast, MVT::v16i16, 3 }, // vpshuflw + vpshufd + vinsertf128 + { TTI::SK_Broadcast, MVT::v32i8, 2 }, // vpshufb + vinsertf128 + { TTI::SK_Reverse, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd { TTI::SK_Reverse, MVT::v8f32, 2 }, // vperm2f128 + vpermilps { TTI::SK_Reverse, MVT::v4i64, 2 }, // vperm2f128 + vpermilpd @@ -689,6 +718,9 @@ return LT.first * Entry->Cost; static const CostTblEntry SSSE3ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v8i16, 1 }, // pshufb + { TTI::SK_Broadcast, MVT::v16i8, 1 }, // pshufb + { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb @@ -701,6 +733,12 @@ return LT.first * Entry->Cost; static const CostTblEntry SSE2ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd + { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd + { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd @@ -720,6 +758,7 @@ return LT.first * Entry->Cost; static const CostTblEntry SSE1ShuffleTbl[] = { + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps }; Index: test/Analysis/CostModel/X86/shuffle-broadcast.ll 
=================================================================== --- test/Analysis/CostModel/X86/shuffle-broadcast.ll +++ test/Analysis/CostModel/X86/shuffle-broadcast.ll @@ -18,14 +18,150 @@ %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V256 = shufflevector - ; AVX: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer ; SSE: cost of 1 {{.*}} %V512 = shufflevector - ; AVX: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer ret void } + +; CHECK-LABEL: 'test_vXi64' +define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) { + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <2 x i64> %src128, <2 x i64> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x 
float> %src256, <16 x float> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x float> %src64, <2 x float> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) { + ; SSE: cost of 1 {{.*}} %V64 = shufflevector + ; AVX: cost of 1 {{.*}} %V64 = shufflevector + ; AVX512: cost of 1 {{.*}} %V64 = shufflevector + %V64 = shufflevector <2 x i32> %src64, <2 x i32> undef, <2 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> zeroinitializer + + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> zeroinitializer + + ; 
SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) { + ; SSE2: cost of 2 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 3 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 2 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 3 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> zeroinitializer + + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SSE2: cost of 3 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> 
%src128, <16 x i8> undef, <16 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V256 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V256 = shufflevector + ; SSE42: cost of 1 {{.*}} %V256 = shufflevector + ; AVX1: cost of 2 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> zeroinitializer + + ; SSE2: cost of 3 {{.*}} %V512 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V512 = shufflevector + ; SSE42: cost of 1 {{.*}} %V512 = shufflevector + ; AVX1: cost of 2 {{.*}} %V512 = shufflevector + ; AVX2: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512F: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> zeroinitializer + + ret void +}