Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -723,8 +723,6 @@
             (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
-  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
-            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
 }
 
 let AddedComplexity = 20 in {
@@ -5337,6 +5335,12 @@
   }
 } // isCodeGenOnly, SchedRW
 
+let Predicates = [UseSSE2] in {
+  let Predicates = [NoSSE41], AddedComplexity = 15 in
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+            (PSHUFDri (MOVZPQILo2PQIrr $src), 0xA8)>;
+}
+
 let AddedComplexity = 20 in {
   let Predicates = [UseAVX] in {
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
Index: test/CodeGen/X86/lower-vec-shift-2.ll
===================================================================
--- test/CodeGen/X86/lower-vec-shift-2.ll
+++ test/CodeGen/X86/lower-vec-shift-2.ll
@@ -25,9 +25,9 @@
 define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
 ; SSE2-LABEL: test2:
 ; SSE2:       # BB#0
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    movss %xmm1, %xmm2
-; SSE2-NEXT:    pslld %xmm2, %xmm0
+; SSE2-NEXT:    movq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,2]
+; SSE2-NEXT:    pslld %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ; AVX-LABEL: test2:
 ; AVX:       # BB#0
@@ -80,9 +80,9 @@
 define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
 ; SSE2-LABEL: test5:
 ; SSE2:       # BB#0
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    movss %xmm1, %xmm2
-; SSE2-NEXT:    psrld %xmm2, %xmm0
+; SSE2-NEXT:    movq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,2]
+; SSE2-NEXT:    psrld %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ; AVX-LABEL: test5:
 ; AVX:       # BB#0
@@ -135,9 +135,9 @@
 define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
 ; SSE2-LABEL: test8:
 ; SSE2:       # BB#0
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    movss %xmm1, %xmm2
-; SSE2-NEXT:    psrad %xmm2, %xmm0
+; SSE2-NEXT:    movq %xmm1, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,2]
+; SSE2-NEXT:    psrad %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ; AVX-LABEL: test8:
 ; AVX:       # BB#0
Index: test/CodeGen/X86/uint_to_fp-2.ll
===================================================================
--- test/CodeGen/X86/uint_to_fp-2.ll
+++ test/CodeGen/X86/uint_to_fp-2.ll
@@ -25,13 +25,12 @@
 ; CHECK-LABEL: test2:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    pushl %eax
-; CHECK-NEXT:    xorps %xmm1, %xmm1
-; CHECK-NEXT:    movss %xmm0, %xmm1
-; CHECK-NEXT:    movsd .LCPI1_0, %xmm0
-; CHECK-NEXT:    orps %xmm0, %xmm1
-; CHECK-NEXT:    subsd %xmm0, %xmm1
-; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    cvtsd2ss %xmm1, %xmm0
+; CHECK-NEXT:    movq %xmm0, %xmm0
+; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; CHECK-NEXT:    movsd .LCPI1_0, %xmm1
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    subsd %xmm1, %xmm0
+; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm0
 ; CHECK-NEXT:    movss %xmm0, (%esp)
 ; CHECK-NEXT:    flds (%esp)
 ; CHECK-NEXT:    popl %eax
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -660,23 +660,20 @@
 define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_4zzz:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    movq %xmm0, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_4zzz:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
-; SSE3-NEXT:    movaps %xmm1, %xmm0
+; SSE3-NEXT:    movq %xmm0, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_4zzz:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
-; SSSE3-NEXT:    movaps %xmm1, %xmm0
+; SSSE3-NEXT:    movq %xmm0, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_4zzz:
@@ -697,24 +694,25 @@
 define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_z4zz:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE2-NEXT:    movq %xmm0, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_z4zz:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
+; SSE3-NEXT:    movq %xmm0, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_z4zz:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
-; SSSE3-NEXT:    retq
+; SSSE3-NEXT:    movq %xmm0, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; SSSE3-NEXT:    retq
+
 ;
 ; SSE41-LABEL: shuffle_v4i32_z4zz:
 ; SSE41:       # BB#0:
@@ -736,23 +734,23 @@
 define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_zz4z:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    movss %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE2-NEXT:    movq %xmm0, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_zz4z:
 ; SSE3:       # BB#0:
-; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    movss %xmm0, %xmm1
-; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSE3-NEXT:    movq %xmm0, %xmm0
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_zz4z:
 ; SSSE3:       # BB#0:
-; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    movss %xmm0, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
+; SSSE3-NEXT:    movq %xmm0, %xmm0
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_zz4z:
Index: test/CodeGen/X86/vector-zmov.ll
===================================================================
--- test/CodeGen/X86/vector-zmov.ll
+++ test/CodeGen/X86/vector-zmov.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) {
+; SSE-LABEL: load_zmov_4i32_to_0zzz:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movd (%rdi), %xmm0
+; SSE-NEXT:    retq
+
+; AVX-LABEL: load_zmov_4i32_to_0zzz:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovd (%rdi), %xmm0
+; AVX-NEXT:    retq
+entry:
+  %X = load <4 x i32>* %ptr
+  %Y = shufflevector <4 x i32> %X, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
+  ret <4 x i32>%Y
+}
+
+define <2 x i64> @load_zmov_2i64_to_0z(<2 x i64> *%ptr) {
+; SSE-LABEL: load_zmov_2i64_to_0z:
+; SSE:       # BB#0: # %entry
+; SSE-NEXT:    movq (%rdi), %xmm0
+; SSE-NEXT:    retq
+
+; AVX-LABEL: load_zmov_2i64_to_0z:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovq (%rdi), %xmm0
+; AVX-NEXT:    retq
+entry:
+  %X = load <2 x i64>* %ptr
+  %Y = shufflevector <2 x i64> %X, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  ret <2 x i64>%Y
+}