Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -6948,6 +6948,26 @@ /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + // If this is a splat of a pair of elements, use MOVDDUP (unless the target + // has XOP; in that case defer lowering to potentially use VPERMIL2PS). + // Because we're creating a less complicated build vector here, we may enable + // further folding of the MOVDDUP via shuffle transforms. + if (Subtarget.hasSSE3() && !Subtarget.hasXOP() && + Op.getOperand(0) == Op.getOperand(2) && + Op.getOperand(1) == Op.getOperand(3) && + Op.getOperand(0) != Op.getOperand(1)) { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + MVT EltVT = VT.getVectorElementType(); + // Create a new build vector with the first 2 elements followed by undef + // padding, bitcast to v2f64, duplicate, and bitcast back. + SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), + DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; + SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops)); + SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV); + return DAG.getBitcast(VT, Dup); + } + // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { Index: test/CodeGen/X86/avx-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1956,12 +1956,9 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind { ; X86-LABEL: test_mm256_set1_epi64x: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: vmovd %ecx, %xmm0 -; X86-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X86-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X86-NEXT: retl ; Index: test/CodeGen/X86/avx-vbroadcast.ll =================================================================== --- test/CodeGen/X86/avx-vbroadcast.ll +++ test/CodeGen/X86/avx-vbroadcast.ll @@ -6,12 +6,8 @@ ; X32-LABEL: A: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %eax -; X32-NEXT: vmovd %ecx, %xmm0 -; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; @@ -31,17 +27,19 @@ define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp { ; X32-LABEL: A2: ; X32: ## %bb.0: ## %entry +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %esi, -8 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl (%ecx), %edx -; X32-NEXT: movl 4(%ecx), %ecx -; X32-NEXT: movl %ecx, 4(%eax) +; X32-NEXT: movl 4(%ecx), %esi +; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movl %edx, (%eax) -; X32-NEXT: vmovd %edx, %xmm0 -; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: A2: @@ -592,12 +590,8 @@ ; X32-LABEL: G: ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl (%eax), %ecx -; X32-NEXT: movl 4(%eax), %eax -; X32-NEXT: vmovd %ecx, %xmm0 -; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X32-NEXT: retl ; ; X64-LABEL: G: @@ -615,16 +609,18 @@ define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp { ; X32-LABEL: G2: ; X32: ## %bb.0: ## %entry +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %esi, -8 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl (%ecx), %edx -; X32-NEXT: movl 4(%ecx), %ecx -; X32-NEXT: movl %ecx, 4(%eax) +; X32-NEXT: movl 4(%ecx), %esi +; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movl %edx, (%eax) -; X32-NEXT: vmovd %edx, %xmm0 -; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; X32-NEXT: movl %esi, 4(%eax) +; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: G2: @@ -879,11 +875,11 @@ ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _gfunc ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ## 4-byte Spill +; X32-NEXT: vmovss %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _gfunc ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 4-byte Folded Reload +; X32-NEXT: vsubss {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload ; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: flds {{[0-9]+}}(%esp) ; X32-NEXT: addl $40, %esp @@ -896,11 +892,11 @@ ; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; X64-NEXT: callq _gfunc ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: vmovss %xmm0, {{[0-9]+}}(%rsp) ## 4-byte Spill +; X64-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill ; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; X64-NEXT: callq _gfunc ; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload +; X64-NEXT: vsubss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 ## 4-byte Folded Reload ; X64-NEXT: addq $40, %rsp ; X64-NEXT: retq %1 = alloca <4 x float>, align 16 Index: test/CodeGen/X86/build-vector-128.ll =================================================================== --- test/CodeGen/X86/build-vector-128.ll +++ test/CodeGen/X86/build-vector-128.ll @@ -527,39 +527,27 @@ ; SSE41-32-LABEL: PR37502: ; SSE41-32: # %bb.0: ; SSE41-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] +; SSE41-32-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-32-NEXT: retl ; ; SSE41-64-LABEL: PR37502: ; SSE41-64: # %bb.0: -; SSE41-64-NEXT: movaps %xmm0, %xmm2 -; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[2,3] -; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0],xmm2[3] -; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[0] -; SSE41-64-NEXT: movaps %xmm2, %xmm0 +; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-64-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE41-64-NEXT: retq ; -; AVX1-32-LABEL: PR37502: -; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] -; AVX1-32-NEXT: retl +; AVX-32-LABEL: PR37502: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX-32-NEXT: retl ; ; AVX1-64-LABEL: PR37502: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[2,3] -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3] -; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1-64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-64-NEXT: retq ; -; AVX2-32-LABEL: PR37502: -; AVX2-32: # %bb.0: -; AVX2-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX2-32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; AVX2-32-NEXT: retl -; ; AVX2-64-LABEL: PR37502: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -3979,12 +3979,11 @@ ; ; X86-AVX1-LABEL: test_mm_set1_epi64x: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] -; X86-AVX1-NEXT: vmovd %ecx, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc1] -; X86-AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01] -; X86-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02] -; X86-AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x03] +; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] +; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01] +; X86-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44] +; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_set1_epi64x: