diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12612,6 +12612,23 @@
   return SDValue();
 }
 
+static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
+  return isUndefOrEqual(Mask, 0);
+}
+
+static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
+  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
+}
+
+static SDValue getSplatOfVectorElement(const SDLoc &DL, SDValue Vec, int EltIdx,
+                                       SelectionDAG &DAG) {
+  EVT VT = Vec.getValueType();
+  SDValue ScalarElt =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT.getScalarType(), Vec,
+                  DAG.getIntPtrConstant(EltIdx, DL));
+  return DAG.getSplatBuildVector(VT, DL, ScalarElt);
+}
+
 /// Generic routine to decompose a shuffle and blend into independent
 /// blends and permutes.
 ///
@@ -12621,7 +12638,8 @@
 /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
 static SDValue lowerShuffleAsDecomposedShuffleMerge(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+    const X86Subtarget &Subtarget, SelectionDAG &DAG,
+    bool BlendOfBroadcastsAndOrIdentitiesOnly = false) {
   int NumElts = Mask.size();
   int NumLanes = VT.getSizeInBits() / 128;
   int NumEltsPerLane = NumElts / NumLanes;
@@ -12635,16 +12653,46 @@
   for (int i = 0; i < NumElts; ++i) {
     int M = Mask[i];
     if (M >= 0 && M < NumElts) {
+      // FIXME: pick 0'th element if V1 is a splat?
      V1Mask[i] = M;
       FinalMask[i] = i;
       IsAlternating &= (i & 1) == 0;
     } else if (M >= NumElts) {
+      // FIXME: pick 0'th element if V2 is a splat?
       V2Mask[i] = M - NumElts;
       FinalMask[i] = i + NumElts;
       IsAlternating &= (i & 1) == 1;
     }
   }
 
+  auto canonicalizeBroadcastableInput =
+      [DL, &Subtarget, &DAG](SDValue &Input, MutableArrayRef<int> InputMask) {
+        unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
+        if (!Subtarget.hasAVX2() &&
+            (!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
+          return;
+        if (!isBroadcastShuffleMask(InputMask))
+          return;
+        Input = getSplatOfVectorElement(DL, Input, 0, DAG);
+        for (auto I : enumerate(InputMask)) {
+          int &InputMaskElt = I.value();
+          if (InputMaskElt >= 0)
+            InputMaskElt = I.index();
+        }
+      };
+
+  if (isNoopOrBroadcastShuffleMask(V1Mask) &&
+      isNoopOrBroadcastShuffleMask(V2Mask)) {
+    canonicalizeBroadcastableInput(V1, V1Mask);
+    canonicalizeBroadcastableInput(V2, V2Mask);
+  }
+
+  if (BlendOfBroadcastsAndOrIdentitiesOnly) {
+    if (isNoopShuffleMask(V1Mask) && isNoopShuffleMask(V2Mask))
+      return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
+    return SDValue();
+  }
+
   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
   // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
   // the shuffle may be able to fold with a load or other benefit. However, when
@@ -12693,6 +12741,14 @@
   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
 }
 
+static SDValue lowerShuffleAsBlendWithBroadcast(
+    const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, MVT VT,
+    SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+  return lowerShuffleAsDecomposedShuffleMerge(
+      DL, VT, V1, V2, Mask, Subtarget, DAG,
+      /*BlendOfBroadcastsAndOrIdentitiesOnly=*/true);
+}
+
 /// Try to lower a vector shuffle as a bit rotation.
 ///
 /// Look for a repeated rotation pattern in each sub group.
@@ -17128,6 +17184,8 @@
           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  // FIXME: we should run `lowerShuffleAsBlendWithBroadcast()` here.
+
   // Try to permute the lanes and then use a per-lane permute.
   if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
                                                       Mask, DAG, Subtarget))
     return V;
@@ -17151,6 +17209,12 @@
                                              Zeroable, Subtarget, DAG))
     return Op;
 
+  // See if this shuffle can be represented as a broadcast of 0'th element
+  // of some input, and a blend between said broadcast and an input.
+  if (SDValue Blend = lowerShuffleAsBlendWithBroadcast(
+          DL, Mask, Zeroable, MVT::v4f64, V1, V2, Subtarget, DAG))
+    return Blend;
+
   // If we have lane crossing shuffles AND they don't all come from the lower
   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
   // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
@@ -17273,6 +17337,12 @@
   if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
     return V;
 
+  // See if this shuffle can be represented as a broadcast of 0'th element
+  // of some input, and a blend between said broadcast and an input.
+  if (SDValue Blend = lowerShuffleAsBlendWithBroadcast(
+          DL, Mask, Zeroable, MVT::v4i64, V1, V2, Subtarget, DAG))
+    return Blend;
+
   // If we have one input in place, then we can permute the other input and
   // blend the result.
   if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -151,8 +151,8 @@
 define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; CHECK-NEXT:    retq
   %r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
   ret <4 x double> %r
@@ -741,7 +741,7 @@
 define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_1_binary(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_1_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; CHECK-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -763,8 +763,7 @@
 define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_2_binary(<32 x i8> %x, <32 x i8> %y) nounwind {
 ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_2_binary:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; CHECK-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -800,7 +799,7 @@
 ; CHECK-LABEL: vec256_eltty_i8_source_subvec_1_target_subvec_mask_1_unary:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; CHECK-NEXT:    vpbroadcastb %xmm1, %ymm1
 ; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -268,11 +268,11 @@
 ; AVX2-SLOW-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-SLOW-NEXT:    vhaddps %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-SLOW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-SLOW-NEXT:    vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-SLOW-NEXT:    retq
 ;
 ; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -289,11 +289,11 @@
 ; AVX2-FAST-NEXT:    vaddps %xmm1, %xmm3, %xmm1
 ; AVX2-FAST-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-FAST-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm2, %xmm2
-; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT:    vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
   %9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32>
   %10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32>
@@ -463,8 +463,7 @@
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
 ; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT:    vphaddd %xmm7, %xmm6, %xmm1
-; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm2
-; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm1
 ; AVX2-FAST-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT:    retq
@@ -1082,28 +1081,28 @@
 ; SSSE3-FAST-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSSE3-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
-; AVX-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm6, %xmm3, %xmm3
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX1-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX1-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-SLOW-NEXT:    retq
 ;
 ; AVX1-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; AVX1-FAST:       # %bb.0:
@@ -1122,6 +1121,28 @@
 ; AVX1-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-FAST-NEXT:    retq
 ;
+; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm3, %xmm1
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm2, %xmm2
+; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    retq
+;
 ; AVX2-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; AVX2-FAST:       # %bb.0:
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
@@ -1134,7 +1155,7 @@
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
 ; AVX2-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
 ; AVX2-FAST-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
+; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
 ; AVX2-FAST-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-FAST-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -1677,18 +1677,44 @@
 }
 
 define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
-; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    vbroadcastsd (%eax), %ymm1
-; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; X86-NEXT:    retl
+; X86-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X86-AVX1-NEXT:    retl
 ;
-; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
-; X64:       # %bb.0:
-; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
-; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; X64-NEXT:    retq
+; X86-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX2:       # %bb.0:
+; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX2-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X86-AVX2-NEXT:    retl
+;
+; X86-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vbroadcastsd (%eax), %ymm1
+; X86-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX1-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX1-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX2-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %ymm1
+; X64-AVX512-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; X64-AVX512-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
   %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32>
   %res = select <8 x i1> , <8 x float> %shuf, <8 x float> %default
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -701,27 +701,16 @@
 ;
 ; AVX2-LABEL: shuffle_v4f64_0044:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT:    retq
 ;
-; AVX512VL-SLOW-LABEL: shuffle_v4f64_0044:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-ALL-LABEL: shuffle_v4f64_0044:
-; AVX512VL-FAST-ALL:       # %bb.0:
-; AVX512VL-FAST-ALL-NEXT:    vmovapd {{.*#+}} ymm2 = [0,0,4,4]
-; AVX512VL-FAST-ALL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
-; AVX512VL-FAST-ALL-NEXT:    retq
-;
-; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4f64_0044:
-; AVX512VL-FAST-PERLANE:       # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-FAST-PERLANE-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512VL-FAST-PERLANE-NEXT:    retq
+; AVX512VL-LABEL: shuffle_v4f64_0044:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovapd {{.*#+}} ymm2 = [0,0,4,4]
+; AVX512VL-NEXT:    vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
   %1 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
   ret <4 x double> %1
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -468,11 +468,11 @@
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X86-AVX2-NEXT:    vmovapd %ymm3, (%edx)
 ; X86-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
@@ -495,8 +495,8 @@
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X86-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X86-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
 ; X86-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X86-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 ; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,0,3,0,8,0,1,0]
@@ -538,11 +538,11 @@
 ;
 ; X64-AVX2-LABEL: PR48908:
 ; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vbroadcastsd %xmm1, %ymm3
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX2-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
 ; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
 ; X64-AVX2-NEXT:    vmovapd %ymm3, (%rdi)
 ; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2],ymm5[3]
@@ -562,8 +562,8 @@
 ; X64-AVX512-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
 ; X64-AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[2]
+; X64-AVX512-NEXT:    vbroadcastsd %xmm1, %ymm3
+; X64-AVX512-NEXT:    vblendpd {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3]
 ; X64-AVX512-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
 ; X64-AVX512-NEXT:    vshufpd {{.*#+}} ymm4 = ymm1[1],ymm4[0],ymm1[2],ymm4[3]
 ; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm5 = [0,3,8,1]