diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4298,15 +4298,6 @@
              (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
               (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-              (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-              (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-
   def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
              (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4315,17 +4306,6 @@
             (SUBREG_TO_REG (i32 0),
              (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
               (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-              (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-              (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
 }
 
 // Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
@@ -4468,6 +4448,28 @@
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
   def : Pat<(v8i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+
+  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -300,17 +300,6 @@
             (SUBREG_TO_REG (i32 0),
              (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
-             sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
-             sub_xmm)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -4328,6 +4317,19 @@
             (MOVZPQILo2PQIrr VR128:$src)>;
 }
 
+let Predicates = [UseAVX] in {
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIrr
+                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIrr
+                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+             sub_xmm)>;
+}
+
 //===---------------------------------------------------------------------===//
 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
 //===---------------------------------------------------------------------===//
@@ -6355,17 +6357,6 @@
              (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
-
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
-                    (i8 1))), sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
-                    (i8 0xf))), sub_xmm)>;
 }
 
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll
--- a/llvm/test/CodeGen/X86/vec_extract-avx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll
@@ -144,19 +144,17 @@
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovups (%ecx), %xmm0
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X32-NEXT:    vmovaps %ymm0, (%eax)
+; X32-NEXT:    vmovdqu (%ecx), %xmm0
+; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT:    vmovdqa %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2i64_4i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovups (%rdi), %xmm0
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X64-NEXT:    vmovaps %ymm0, (%rsi)
+; X64-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT:    vmovdqa %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x i64>, <2 x i64>* %in, align 8
@@ -198,19 +196,17 @@
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vmovups (%ecx), %xmm0
-; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X32-NEXT:    vmovaps %ymm0, (%eax)
+; X32-NEXT:    vmovdqu (%ecx), %xmm0
+; X32-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT:    vmovdqa %ymm0, (%eax)
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: legal_vzmovl_2f64_4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovups (%rdi), %xmm0
-; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X64-NEXT:    vmovaps %ymm0, (%rsi)
+; X64-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT:    vmovdqa %ymm0, (%rsi)
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %ld = load <2 x double>, <2 x double>* %in, align 8
diff --git a/llvm/test/CodeGen/X86/vector-extend-inreg.ll b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
--- a/llvm/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/llvm/test/CodeGen/X86/vector-extend-inreg.ll
@@ -71,18 +71,17 @@
 ; X32-AVX-NEXT:    andl $-128, %esp
 ; X32-AVX-NEXT:    subl $384, %esp # imm = 0x180
 ; X32-AVX-NEXT:    movl 40(%ebp), %ecx
-; X32-AVX-NEXT:    vbroadcastsd 32(%ebp), %ymm0
-; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X32-AVX-NEXT:    vpbroadcastq 32(%ebp), %ymm0
+; X32-AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    vmovaps %ymm1, (%esp)
-; X32-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%esp)
 ; X32-AVX-NEXT:    leal (%ecx,%ecx), %eax
 ; X32-AVX-NEXT:    andl $31, %eax
 ; X32-AVX-NEXT:    movl 128(%esp,%eax,4), %eax
@@ -101,14 +100,13 @@
 ; X64-AVX-NEXT:    andq $-128, %rsp
 ; X64-AVX-NEXT:    subq $256, %rsp # imm = 0x100
 ; X64-AVX-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-AVX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,1,2,3]
-; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X64-AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    vmovaps %ymm1, (%rsp)
-; X64-AVX-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT:    andl $15, %edi
 ; X64-AVX-NEXT:    movq (%rsp,%rdi,8), %rax
 ; X64-AVX-NEXT:    movq %rbp, %rsp
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1505,8 +1505,7 @@
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # %bb.0:
 ; ALL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32>
@@ -1987,8 +1986,7 @@
 define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
 ; ALL-LABEL: shuffle_v4f64_0zzz_optsize:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32>
   ret <4 x double> %b
@@ -1997,8 +1995,7 @@
 define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {
 ; ALL-LABEL: shuffle_v4i64_0zzz_optsize:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    retq
   %b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32>
   ret <4 x i64> %b
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1973,8 +1973,7 @@
 define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
 ; ALL-LABEL: shuffle_v8i64_0zzzzzzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32>
   ret <8 x i64> %shuffle
@@ -1983,8 +1982,7 @@
 define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {
 ; ALL-LABEL: shuffle_v8f64_0zzzzzzz:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; ALL-NEXT:    ret{{[l|q]}}
   %shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32>
   ret <8 x double> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -380,8 +380,7 @@
 define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
 ; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = bitcast <4 x double> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> )