diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16035,6 +16035,140 @@
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
 }
 
+static SDValue lowerShuffleAsBlendWithBroadcast(
+    const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, MVT VT,
+    SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  unsigned NumElts = VT.getVectorNumElements();
+  assert(Mask.size() == NumElts && "Mask size mismatch?");
+
+  // The scalar element type must be legal for us to succeed.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT.getScalarType()))
+    return SDValue();
+
+  // Don't get stuck reprocessing the broadcast.
+  if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
+    return SDValue();
+
+  // Can we theoretically broadcast this element type?
+  // Note that for AVX1 we must be able to fold the load!
+  if (!((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
+        (Subtarget.hasAVX() && EltSizeInBits >= 32)))
+    return SDValue();
+
+  enum class EltStatus { Undef, Blendable, BroadcastCandidate };
+  SmallVector<EltStatus, 32> DecipheredMask;
+  DecipheredMask.reserve(Mask.size());
+  transform(enumerate(Mask), std::back_inserter(DecipheredMask),
+            [NumElts](auto I) {
+              unsigned EltIdx = I.index();
+              int MaskElt = I.value();
+
+              assert(isUndefOrInRange(MaskElt, 0, 2 * NumElts) &&
+                     "Malformed shuffle?");
+
+              if (MaskElt < 0)
+                return EltStatus::Undef;
+              if ((unsigned)MaskElt == EltIdx ||
+                  (unsigned)MaskElt == NumElts + EltIdx)
+                return EltStatus::Blendable;
+              // FIXME: what if inputs are broadcasts?
+              return EltStatus::BroadcastCandidate;
+            });
+  assert(DecipheredMask.size() == Mask.size() && "Mask size shouldn't change.");
+
+  // If this already looks like a blend, then don't bother any further.
+  if (!is_contained(DecipheredMask, EltStatus::BroadcastCandidate))
+    return SDValue();
+
+  // Are all non-blendable elements picking the same scalar value?
+  Optional<unsigned> CandidateMaskElt;
+  for (auto I : zip(DecipheredMask, Mask)) {
+    EltStatus Dsc = std::get<0>(I);
+    int MaskElt = std::get<1>(I);
+
+    if (Dsc != EltStatus::BroadcastCandidate)
+      continue;
+
+    // FIXME: what if inputs are broadcasts?
+    if (!CandidateMaskElt)
+      CandidateMaskElt = MaskElt;
+    else if (*CandidateMaskElt != (unsigned)MaskElt)
+      return SDValue();
+  }
+  assert(CandidateMaskElt &&
+         "Should have found a single non-blendable mask element!");
+
+  int ToBeBroadcastedInputIdx = *CandidateMaskElt / NumElts;
+  int ToBeBroadcastedInputEltIdx = *CandidateMaskElt % NumElts;
+
+  // We can only freely broadcast the 0'th element, so let's abort otherwise.
+  if (ToBeBroadcastedInputEltIdx != 0)
+    return SDValue();
+
+  // Strike out all CandidateMaskElt mask elements.
+  SmallVector<int, 32> AdjustedMask(Mask.begin(), Mask.end());
+  for (auto I : zip(DecipheredMask, AdjustedMask)) {
+    EltStatus &Dsc = std::get<0>(I);
+    int &MaskElt = std::get<1>(I);
+
+    if ((unsigned)MaskElt == *CandidateMaskElt) {
+      Dsc = EltStatus::BroadcastCandidate;
+      MaskElt = -1;
+    }
+  }
+
+  // The remaining mask needs to be a single-source mask.
+  if (!ShuffleVectorInst::isSingleSourceMask(AdjustedMask))
+    return SDValue();
+
+  assert(is_contained(DecipheredMask, EltStatus::Blendable) &&
+         "Whole shuffle simplified into a broadcast?");
+
+  assert(ShuffleVectorInst::isIdentityMask(AdjustedMask) &&
+         "By now the remaining mask can only be an identity mask.");
+
+  std::array<SDValue, 2> Inputs = {V1, V2};
+
+  // From which input does the to-be-broadcasted element come?
+  SDValue ToBeBroadcastedInput = Inputs[ToBeBroadcastedInputIdx];
+
+  // Also, if we don't have AVX2 broadcast-from-reg,
+  // we must be able to fold the load.
+  if (!Subtarget.hasAVX2() && !MayFoldLoad(ToBeBroadcastedInput))
+    return SDValue();
+
+  // Which input do we keep as the baseline identity?
+  int IdentityInputIdx = *find_if(AdjustedMask, [](int MaskElt) {
+    return MaskElt != -1;
+  }) / NumElts;
+  SDValue IdentityInput = Inputs[IdentityInputIdx];
+
+  // Okay, this can be represented as a blend with a broadcast!
+  SDValue ScalarElt = DAG.getNode(
+      ISD::EXTRACT_VECTOR_ELT, DL, VT.getScalarType(), ToBeBroadcastedInput,
+      DAG.getIntPtrConstant(ToBeBroadcastedInputEltIdx, DL));
+  SDValue InputSplatVec = DAG.getSplatBuildVector(VT, DL, ScalarElt);
+  SmallVector<int, 32> BlendMask;
+  BlendMask.reserve(NumElts);
+  transform(enumerate(DecipheredMask), std::back_inserter(BlendMask),
+            [NumElts](auto I) -> int {
+              int EltIdx = I.index();
+              EltStatus Dsc = I.value();
+
+              switch (Dsc) {
+              case EltStatus::Undef:
+                return -1;
+              case EltStatus::BroadcastCandidate:
+                return EltIdx + NumElts;
+              default:
+                return EltIdx;
+              }
+            });
+  assert(BlendMask.size() == Mask.size() && "Mask size shouldn't change.");
+  return DAG.getVectorShuffle(VT, DL, IdentityInput, InputSplatVec, BlendMask);
+}
+
 /// Either split a vector in halves or decompose the shuffles and the
 /// blend/unpack.
 ///
@@ -17128,6 +17262,8 @@
                                              DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return V;
 
+  // FIXME: we should run `lowerShuffleAsBlendWithBroadcast()` here.
+
   // Try to permute the lanes and then use a per-lane permute.
   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
                                                       Mask, DAG, Subtarget))
@@ -17151,6 +17287,12 @@
                                                Zeroable, Subtarget, DAG))
     return Op;
 
+  // See if this shuffle can be represented as a broadcast of the 0'th element
+  // of some input, and a blend between said broadcast and an input.
+  if (SDValue Blend = lowerShuffleAsBlendWithBroadcast(
+          DL, Mask, Zeroable, MVT::v4f64, V1, V2, Subtarget, DAG))
+    return Blend;
+
   // If we have lane crossing shuffles AND they don't all come from the lower
   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
@@ -17273,6 +17415,12 @@
   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
     return V;
 
+  // See if this shuffle can be represented as a broadcast of the 0'th element
+  // of some input, and a blend between said broadcast and an input.
+  if (SDValue Blend = lowerShuffleAsBlendWithBroadcast(
+          DL, Mask, Zeroable, MVT::v4i64, V1, V2, Subtarget, DAG))
+    return Blend;
+
   // If we have one input in place, then we can permute the other input and
   // blend the result.
   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -151,8 +151,8 @@
 define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-NEXT:    retq
   %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x double> %r
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -268,11 +268,11 @@
 ; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-SLOW-NEXT:    vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -289,11 +289,11 @@
 ; AVX2-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
   %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> 
   %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> 
@@ -463,8 +463,7 @@
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
-; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm2
-; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1677,18 +1677,44 @@
 }
 
 define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
-; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vbroadcastsd (%eax), %ymm1
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; X86-NEXT:    retl
+; X86-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X86-AVX1-NEXT:    retl
 ;
-; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
-; X64:       # %bb.0:
-; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
-; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; X64-NEXT:    retq
+; X86-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X86-AVX2-NEXT:    retl
+;
+; X86-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X64-AVX512-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
   %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> 
   %res = select <8 x i1> , <8 x float> %shuf, <8 x float> %default
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -468,11 +468,11 @@
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X86-AVX2-NEXT:    vmovapd %ymm3, (%edx)
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
@@ -495,8 +495,8 @@
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X86-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
 ; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 ; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]
@@ -538,11 +538,11 @@
 ;
 ; X64-AVX2-LABEL: PR48908:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X64-AVX2-NEXT:    vmovapd %ymm3, (%rdi)
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
@@ -562,8 +562,8 @@
 ; X64-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X64-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
 ; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 ; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,3,8,1]
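
Illustrative example (not part of the patch): the shuffles this new lowering targets are those where all lanes but one are an identity pick of one operand, and the remaining lane takes element 0 of the other operand. A minimal LLVM IR sketch, assuming AVX2 and a hypothetical function name @blend_bcast_v4f64, mirroring the updated test in copy-low-subvec-elt-to-high-subvec-elt.ll:

  define <4 x double> @blend_bcast_v4f64(<4 x double> %x, <4 x double> %y) {
    ; Lanes 0-2 keep %x (identity picks); lane 3 takes element 0 of %y
    ; (mask index 4), i.e. a blend of %x with a splat of %y[0].
    %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
    ret <4 x double> %r
  }

With lowerShuffleAsBlendWithBroadcast() this should lower to roughly a vbroadcastsd of the low element of %y followed by a single vblendps/vblendpd, instead of the previous vinsertf128 + vshufpd sequence, matching the CHECK-line changes above.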