Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td
@@ -7168,6 +7168,10 @@
 }
 
 // Patterns
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
 let Predicates = [UseAVX] in {
   let AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
@@ -7184,8 +7188,10 @@
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
             (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
   }
 
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
@@ -7199,14 +7205,19 @@
                    (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                    sub_xmm)>;
 
-  // Move low f64 and clear high bits.
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
-
+  // These will incur an FP/int domain crossing penalty, but it may be the only
+  // way without AVX2. Do not add any complexity because we may be able to match
+  // more optimal patterns defined earlier in this file.
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
             (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
 }
 
+// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
+// on targets where they have equal performance. These were changed to use
+// blends because blends have better throughput on SandyBridge and Haswell, but
+// movs[s/d] are 1-2 byte shorter instructions.
 let Predicates = [UseSSE41] in {
   // With SSE41 we can use blends for these patterns.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -843,8 +843,9 @@
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT:    # kill: XMM0<def> XMM0<kill> YMM0<def>
+; ALL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; ALL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
   %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,6 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -962,8 +960,6 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
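
Note (not part of the patch): a minimal IR sketch of the kind of shuffle the changed v4f64 blend pattern covers, assuming an AVX-capable x86-64 target. The function name and the exact registers in the CHECK lines are illustrative, modeled on the insert_reg_and_zero_v4f64 test updated above, not output captured from llc.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s

; Keep the low f64 element of a 256-bit vector and clear the high bits.
; With the patterns above, this is expected to select a vblendpd against a
; zeroed ymm register rather than an xmm-domain vmovsd.
; NOTE: function name and CHECK register choices are illustrative only.
define <4 x double> @zero_high_v4f64(<4 x double> %v) {
; CHECK-LABEL: zero_high_v4f64:
; CHECK:       vxorpd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:  vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
  %shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer,
                           <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x double> %shuffle
}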