Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -7359,16 +7359,36 @@ SDValue V2, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + // Determine if inputs are safe splats (no undefs). + auto isSafeSplat = [](SDValue V) { + BitVector UndefElements; + if (auto *BVOp = dyn_cast<BuildVectorSDNode>(V.getNode())) { + if (BVOp->getConstantSplatNode(&UndefElements) && UndefElements.none()) + return true; + if (BVOp->getConstantFPSplatNode(&UndefElements) && UndefElements.none()) + return true; + } + if (auto *SVNOp = dyn_cast<ShuffleVectorSDNode>(V.getNode())) { + ArrayRef<int> SVNMask = SVNOp->getMask(); + for (int i = 0, Size = SVNMask.size(); i < Size; ++i) + if (SVNMask[i] < 0 || SVNMask[i] != SVNMask[0]) + return false; + return true; + } + return false; + }; + bool IsV1Splat = isSafeSplat(V1); + bool IsV2Splat = isSafeSplat(V2); unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] >= Size) { - if (Mask[i] != i + Size) + if (Mask[i] != i + Size && !IsV2Splat) return SDValue(); // Shuffled V2 input! BlendMask |= 1u << i; continue; } - if (Mask[i] >= 0 && Mask[i] != i) + if (Mask[i] >= 0 && Mask[i] != i && !IsV1Splat) return SDValue(); // Shuffled V1 input! 
} switch (VT.SimpleTy) { Index: test/CodeGen/X86/combine-or.ll =================================================================== --- test/CodeGen/X86/combine-or.ll +++ test/CodeGen/X86/combine-or.ll @@ -206,10 +206,10 @@ define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: test17: ; CHECK: # BB#0: -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,0] -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2] -; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,2] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3] ; CHECK-NEXT: orps %xmm1, %xmm2 ; CHECK-NEXT: movaps %xmm2, %xmm0 ; CHECK-NEXT: retq Index: test/CodeGen/X86/vector-blend.ll =================================================================== --- test/CodeGen/X86/vector-blend.ll +++ test/CodeGen/X86/vector-blend.ll @@ -799,3 +799,46 @@ %select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %select } + +; Blend with Splat + +define <4 x float> @blend_splat_float(<4 x float> %v1, <4 x float> %v2) { +; SSE2-LABEL: blend_splat_float: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: blend_splat_float: +; SSSE3: # BB#0: # %entry +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,3] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: blend_splat_float: +; SSE41: # BB#0: # %entry +; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; SSE41-NEXT: retq +; +; AVX1-LABEL: 
blend_splat_float: +; AVX1: # BB#0: # %entry +; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: blend_splat_float: +; AVX2: # BB#0: # %entry +; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; AVX2-NEXT: retq +entry: + %splat = shufflevector <4 x float> %v2, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3> + %select = shufflevector <4 x float> %v1, <4 x float> %splat, <4 x i32> <i32 4, i32 1, i32 7, i32 3> + ret <4 x float> %select +} +