Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -7174,6 +7174,7 @@
   // MOVS{S,D} to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
             (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  // FIXME: Prefer a movss (smaller encoding) over a blendps.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
@@ -7183,9 +7184,13 @@
 
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0), (VMOVSSrr (v4f32 (V_SET0)),
+             (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrr (v2f64 (V_SET0)),
+             (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
   }
 
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
@@ -7199,14 +7204,19 @@
                            (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                            sub_xmm)>;
 
-  // Move low f64 and clear high bits.
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
-
+  // These will incur an FP/int domain crossing penalty, but it may be the only
+  // way without AVX2. Do not add any complexity because we may be able to match
+  // more optimal patterns defined earlier in this file.
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrr (v4f32 (V_SET0)),
+             (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i64 0), (VMOVSDrr (v2f64 (V_SET0)),
+             (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
 }
 
+// FIXME: Prefer a movss over a blendps and a movsd over a blendpd
+// (smaller encodings).
 let Predicates = [UseSSE41] in {
   // With SSE41 we can use blends for these patterns.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -843,6 +843,7 @@
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # BB#0:
+; ALL-NEXT:    # kill: XMM0 XMM0 YMM0
 ; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; ALL-NEXT:    retq
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,6 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32>
@@ -962,8 +960,6 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32>