Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8117,6 +8117,32 @@
       return LD;
   }
 
+  // If this is a splat of pairs of 32-bit elements, we can use a narrower
+  // build_vector and broadcast it.
+  // TODO: We could probably generalize this more.
+  if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
+    SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
+                       DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
+    auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
+      // Make sure all the even/odd operands match.
+      for (unsigned i = 2; i != NumElems; ++i)
+        if (Ops[i % 2] != Op.getOperand(i))
+          return false;
+      return true;
+    };
+    if (CanSplat(Op, NumElems, Ops)) {
+      MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+      MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
+      // Create a new build vector and cast to v2i64/v2f64.
+      SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
+                                     DAG.getBuildVector(NarrowVT, dl, Ops));
+      // Broadcast from v2i64/v2f64 and cast to final VT.
+      MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
+      return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
+                                            NewBV));
+    }
+  }
+
   // For AVX-length vectors, build the individual 128-bit pieces and use
   // shuffles to put them in place.
   if (VT.getSizeInBits() > 128) {
Index: test/CodeGen/X86/avx2-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx2-vbroadcast.ll
+++ test/CodeGen/X86/avx2-vbroadcast.ll
@@ -189,12 +189,7 @@
 ; X32-LABEL: Q64:
 ; X32:       ## %bb.0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl 4(%eax), %eax
-; X32-NEXT:    vmovd %ecx, %xmm0
-; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT:    vpbroadcastq (%eax), %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: Q64:
@@ -212,13 +207,8 @@
 ; X32-LABEL: QQ64:
 ; X32:       ## %bb.0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl 4(%eax), %eax
-; X32-NEXT:    vmovd %ecx, %xmm0
-; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: QQ64:
@@ -1380,12 +1370,8 @@
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X32-NEXT:    vmovaps %xmm0, (%esp)
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl 4(%eax), %eax
-; X32-NEXT:    vmovd %ecx, %xmm1
-; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
+; X32-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; X32-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    vmovdqa %xmm1, {{[0-9]+}}(%esp)
 ; X32-NEXT:    addl $60, %esp
@@ -1438,15 +1424,10 @@
 ; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; X32-NEXT:    vmovaps %ymm0, (%esp)
-; X32-NEXT:    movl (%eax), %ecx
-; X32-NEXT:    movl 4(%eax), %eax
-; X32-NEXT:    vmovd %ecx, %xmm1
-; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; X32-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:    vbroadcastsd %xmm1, %ymm1
 ; X32-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
 ; X32-NEXT:    vzeroupper
Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -485,16 +485,11 @@
 define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
 ; X32-LABEL: test_mm512_mask_set1_epi64:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    vmovd %edx, %xmm1
-; X32-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
 ; X32-NEXT:    kmovw %eax, %k1
-; X32-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm0 {%k1}
+; X32-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_set1_epi64:
@@ -513,16 +508,11 @@
 define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
 ; X32-LABEL: test_mm512_maskz_set1_epi64:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    vmovd %edx, %xmm0
-; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X32-NEXT:    kmovw %eax, %k1
-; X32-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; X32-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_set1_epi64:
Index: test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -797,16 +797,11 @@
 define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
 ; X32-LABEL: test_mm256_mask_set1_epi64:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X32-NEXT:    vmovd %ecx, %xmm1
-; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
-; X32-NEXT:    kmovw %edx, %k1
-; X32-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1}
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpbroadcastq %xmm1, %ymm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_set1_epi64:
@@ -826,16 +821,11 @@
 define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
 ; X32-LABEL: test_mm256_maskz_set1_epi64:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %dl
-; X32-NEXT:    vmovd %ecx, %xmm0
-; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
-; X32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT:    kmovw %edx, %k1
-; X32-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpbroadcastq %xmm0, %ymm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_set1_epi64:
Index: test/CodeGen/X86/broadcastm-lowering.ll
===================================================================
--- test/CodeGen/X86/broadcastm-lowering.ll
+++ test/CodeGen/X86/broadcastm-lowering.ll
@@ -122,9 +122,7 @@
 ; X86-AVX512VLCDBW-NEXT:    kmovd %k0, %eax
 ; X86-AVX512VLCDBW-NEXT:    movzbl %al, %eax
 ; X86-AVX512VLCDBW-NEXT:    vmovd %eax, %xmm0
-; X86-AVX512VLCDBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; X86-AVX512VLCDBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X86-AVX512VLCDBW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X86-AVX512VLCDBW-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; X86-AVX512VLCDBW-NEXT:    retl
 entry:
   %0 = icmp eq <8 x i32> %a, %b
@@ -160,8 +158,7 @@
 ; X86-AVX512VLCDBW-NEXT:    kmovd %k0, %eax
 ; X86-AVX512VLCDBW-NEXT:    movzbl %al, %eax
 ; X86-AVX512VLCDBW-NEXT:    vmovd %eax, %xmm0
-; X86-AVX512VLCDBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; X86-AVX512VLCDBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X86-AVX512VLCDBW-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; X86-AVX512VLCDBW-NEXT:    retl
 entry:
   %0 = icmp eq <8 x i32> %a, %b
Index: test/CodeGen/X86/insertelement-shuffle.ll
===================================================================
--- test/CodeGen/X86/insertelement-shuffle.ll
+++ test/CodeGen/X86/insertelement-shuffle.ll
@@ -103,14 +103,9 @@
 ; X32_AVX256-NEXT:    subl $8, %esp
 ; X32_AVX256-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32_AVX256-NEXT:    vmovlps %xmm0, (%esp)
-; X32_AVX256-NEXT:    movl (%esp), %eax
-; X32_AVX256-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32_AVX256-NEXT:    vmovd %eax, %xmm0
-; X32_AVX256-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; X32_AVX256-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
-; X32_AVX256-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
-; X32_AVX256-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32_AVX256-NEXT:    vmovdqa %ymm0, %ymm1
+; X32_AVX256-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32_AVX256-NEXT:    vbroadcastsd %xmm0, %ymm0
+; X32_AVX256-NEXT:    vmovaps %ymm0, %ymm1
 ; X32_AVX256-NEXT:    movl %ebp, %esp
 ; X32_AVX256-NEXT:    popl %ebp
 ; X32_AVX256-NEXT:    retl
Index: test/CodeGen/X86/vector-shuffle-combining-xop.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X32 --check-prefix=X86AVX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefix=X32 --check-prefix=X86AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X64 --check-prefix=X64AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s --check-prefix=X64 --check-prefix=X64AVX2
 
 declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone
 declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone
@@ -320,20 +320,35 @@
 
 ; FIXME: Duplicated load in i686
 define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
-; X32-LABEL: buildvector_v4f32_0404:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT:    vmovaps %xmm0, (%eax)
-; X32-NEXT:    retl
+; X86AVX-LABEL: buildvector_v4f32_0404:
+; X86AVX:       # %bb.0:
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X86AVX-NEXT:    vmovaps %xmm0, (%eax)
+; X86AVX-NEXT:    retl
 ;
-; X64-LABEL: buildvector_v4f32_0404:
-; X64:       # %bb.0:
-; X64-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0]
-; X64-NEXT:    vmovaps %xmm0, (%rdi)
-; X64-NEXT:    retq
+; X86AVX2-LABEL: buildvector_v4f32_0404:
+; X86AVX2:       # %bb.0:
+; X86AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X86AVX2-NEXT:    vmovapd %xmm0, (%eax)
+; X86AVX2-NEXT:    retl
+;
+; X64AVX-LABEL: buildvector_v4f32_0404:
+; X64AVX:       # %bb.0:
+; X64AVX-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0]
+; X64AVX-NEXT:    vmovaps %xmm0, (%rdi)
+; X64AVX-NEXT:    retq
+;
+; X64AVX2-LABEL: buildvector_v4f32_0404:
+; X64AVX2:       # %bb.0:
+; X64AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; X64AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64AVX2-NEXT:    vmovapd %xmm0, (%rdi)
+; X64AVX2-NEXT:    retq
   %v0 = insertelement <4 x float> undef, float %a, i32 0
   %v1 = insertelement <4 x float> %v0, float %b, i32 1
   %v2 = insertelement <4 x float> %v1, float %a, i32 2
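
For context, here is a minimal sketch of the pattern the new lowering targets. The function below is illustrative only and is not one of the patch's test cases: a build_vector whose even elements are all equal and whose odd elements are all equal is rebuilt as a two-element build_vector of the pair, bitcast to v2i64/v2f64, and broadcast with X86ISD::VBROADCAST.

; Hypothetical example (not part of the patch): a v4i32 splat of the pair {%a, %b}.
define <4 x i32> @splat_pair_i32(i32 %a, i32 %b) {
  %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
  %v2 = insertelement <4 x i32> %v1, i32 %a, i32 2
  %v3 = insertelement <4 x i32> %v2, i32 %b, i32 3
  ret <4 x i32> %v3
}
; With this change and -mattr=+avx2, the expectation is that the two scalars are
; packed into the low 64 bits of an XMM register (vmovd + vpinsrd) and then
; duplicated with a single vpbroadcastq, instead of four separate vpinsrd
; insertions; the exact register allocation and scheduling may differ.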