Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -5969,6 +5969,7 @@ { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, + { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, @@ -5984,6 +5985,7 @@ { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, + // TODO: Add the AVX versions of MOVLPSmr { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, Index: test/CodeGen/X86/2011-10-19-widen_vselect.ll =================================================================== --- test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -26,7 +26,7 @@ } ; CHECK-LABEL: zero_test -; CHECK: pxor %xmm0, %xmm0 +; CHECK: xorps %xmm0, %xmm0 ; CHECK: ret define void @zero_test() { Index: test/CodeGen/X86/2012-07-10-extload64.ll =================================================================== --- test/CodeGen/X86/2012-07-10-extload64.ll +++ test/CodeGen/X86/2012-07-10-extload64.ll @@ -6,7 +6,7 @@ ; CHECK: pmovzxwd %A27 = load <4 x i16>, <4 x i16>* %in, align 4 %A28 = add <4 x i16> %A27, %A27 -; CHECK: movlpd +; CHECK: movq store <4 x i16> %A28, <4 x i16>* %in, align 4 ret void ; CHECK: ret @@ -18,7 +18,7 @@ BB: store <2 x i32> zeroinitializer, <2 x i32>* %ptr ret void -;CHECK: movlpd +;CHECK: movlps ;CHECK: ret } Index: test/CodeGen/X86/exedeps-movq.ll =================================================================== --- test/CodeGen/X86/exedeps-movq.ll +++ 
test/CodeGen/X86/exedeps-movq.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=AVX + +; Verify that we select the correct version of the instruction that stores the low 64-bits +; of a 128-bit vector. We want to avoid int/fp domain crossing penalties, so ignore the +; bitcast ops and choose: +; +; movlps for floats +; movlpd for doubles +; movq for integers + +define void @store_floats(<4 x float> %x, i64* %p) { +; SSE-LABEL: store_floats: +; SSE: # BB#0: +; SSE-NEXT: addps %xmm0, %xmm0 +; SSE-NEXT: movlps %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: store_floats: +; AVX: # BB#0: +; AVX-NEXT: vaddps %xmm0, %xmm0, %xmm0 + + +; !!! FIXME - the AVX version is not handled correctly. +; AVX-NEXT: vmovq %xmm0, (%rdi) + + +; AVX-NEXT: retq + %a = fadd <4 x float> %x, %x + %b = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1> + %c = bitcast <2 x float> %b to i64 + store i64 %c, i64* %p + ret void +} + +define void @store_double(<2 x double> %x, i64* %p) { +; SSE-LABEL: store_double: +; SSE: # BB#0: +; SSE-NEXT: addpd %xmm0, %xmm0 +; SSE-NEXT: movlpd %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: store_double: +; AVX: # BB#0: +; AVX-NEXT: vaddpd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovlpd %xmm0, (%rdi) +; AVX-NEXT: retq + %a = fadd <2 x double> %x, %x + %b = extractelement <2 x double> %a, i32 0 + %c = bitcast double %b to i64 + store i64 %c, i64* %p + ret void +} + +define void @store_int(<4 x i32> %x, <2 x float>* %p) { +; SSE-LABEL: store_int: +; SSE: # BB#0: +; SSE-NEXT: paddd %xmm0, %xmm0 +; SSE-NEXT: movq %xmm0, (%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: store_int: +; AVX: # BB#0: +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq + %a = add <4 x i32> %x, %x + %b = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> + %c = bitcast <2 x i32> %b to <2 x float> + 
store <2 x float> %c, <2 x float>* %p + ret void +} + Index: test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-x86.ll +++ test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -581,7 +581,7 @@ define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) { ; CHECK: test_x86_sse2_storel_dq ; CHECK: movl - ; CHECK: movq + ; CHECK: movlps call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1) ret void } Index: test/CodeGen/X86/vec_insert-5.ll =================================================================== --- test/CodeGen/X86/vec_insert-5.ll +++ test/CodeGen/X86/vec_insert-5.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: shll $12, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] -; CHECK-NEXT: movlpd %xmm0, (%eax) +; CHECK-NEXT: movq %xmm0, (%eax) ; CHECK-NEXT: retl %tmp12 = shl i32 %a, 12 %tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1 Index: test/CodeGen/X86/vec_insert-mmx.ll =================================================================== --- test/CodeGen/X86/vec_insert-mmx.ll +++ test/CodeGen/X86/vec_insert-mmx.ll @@ -7,7 +7,7 @@ ; X86-32: ## BB#0: ; X86-32: movd {{[0-9]+}}(%esp), %xmm0 ; X86-32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] -; X86-32-NEXT: movlpd %xmm0, (%esp) +; X86-32-NEXT: movq %xmm0, (%esp) ; X86-32-NEXT: movq (%esp), %mm0 ; X86-32-NEXT: addl $12, %esp ; X86-32-NEXT: retl Index: test/CodeGen/X86/vec_zero_cse.ll =================================================================== --- test/CodeGen/X86/vec_zero_cse.ll +++ test/CodeGen/X86/vec_zero_cse.ll @@ -9,7 +9,7 @@ define void @test1() { ;CHECK-LABEL: @test1 -;CHECK: xorpd +;CHECK: xorps store <1 x i64> zeroinitializer, <1 x i64>* @M1 store <2 x i32> zeroinitializer, <2 x i32>* @M2 ret void Index: test/CodeGen/X86/vector-shuffle-mmx.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-mmx.ll +++ 
test/CodeGen/X86/vector-shuffle-mmx.ll @@ -9,7 +9,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-NEXT: movlpd %xmm0, (%eax) +; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test0: @@ -38,13 +38,13 @@ ; X32-NEXT: .cfi_def_cfa_offset 24 ; X32-NEXT: Ltmp2: ; X32-NEXT: .cfi_offset %edi, -8 -; X32-NEXT: xorpd %xmm0, %xmm0 -; X32-NEXT: movlpd %xmm0, (%esp) +; X32-NEXT: xorps %xmm0, %xmm0 +; X32-NEXT: movlps %xmm0, (%esp) ; X32-NEXT: movq (%esp), %mm0 ; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] ; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-NEXT: movlpd %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movq %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: movq {{[0-9]+}}(%esp), %mm1 ; X32-NEXT: xorl %edi, %edi ; X32-NEXT: maskmovq %mm1, %mm0 @@ -54,8 +54,8 @@ ; ; X64-LABEL: test1: ; X64: ## BB#0: ## %entry -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0 ; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] ; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] Index: test/CodeGen/X86/widen_cast-1.ll =================================================================== --- test/CodeGen/X86/widen_cast-1.ll +++ test/CodeGen/X86/widen_cast-1.ll @@ -3,12 +3,14 @@ ; CHECK: movl ; CHECK: paddw -; CHECK: movlpd +; CHECK: movq + +; FIXME - if this test cares about scheduling, why isn't it being checked? 
; Scheduler causes produce a different instruction order ; ATOM: movl ; ATOM: paddw -; ATOM: movlpd +; ATOM: movq ; bitcast a v4i16 to v2i32 Index: test/CodeGen/X86/widen_cast-4.ll =================================================================== --- test/CodeGen/X86/widen_cast-4.ll +++ test/CodeGen/X86/widen_cast-4.ll @@ -52,7 +52,7 @@ ; CHECK-NEXT: psraw $8 ; CHECK-NEXT: psraw $2 ; CHECK-NEXT: pshufb -; CHECK-NEXT: movlpd +; CHECK-NEXT: movq ; ; FIXME: We shouldn't require both a movd and an insert. ; CHECK-WIDE: %forbody Index: test/CodeGen/X86/widen_cast-5.ll =================================================================== --- test/CodeGen/X86/widen_cast-5.ll +++ test/CodeGen/X86/widen_cast-5.ll @@ -1,6 +1,6 @@ ; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s ; CHECK: movl -; CHECK: movlpd +; CHECK: movq ; bitcast a i64 to v2i32 define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind { Index: test/CodeGen/X86/widen_shuffle-1.ll =================================================================== --- test/CodeGen/X86/widen_shuffle-1.ll +++ test/CodeGen/X86/widen_shuffle-1.ll @@ -84,7 +84,7 @@ ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33] ; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; CHECK-NEXT: movlpd %xmm0, (%eax) +; CHECK-NEXT: movq %xmm0, (%eax) ; CHECK-NEXT: retl %v = shufflevector <2 x i8> , <2 x i8> undef, <8 x i32> store <8 x i8> %v, <8 x i8>* %p, align 8