Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8961,7 +8961,9 @@
                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
     }
 
-    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
+    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64,
+                       Mask[0] < 0 ? DAG.getUNDEF(MVT::v2f64) : V1,
+                       Mask[1] < 0 ? DAG.getUNDEF(MVT::v2f64) : V1,
                        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
   }
   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -1300,6 +1300,7 @@
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (scalar_to_vector (loadf64 addr:$src2)))),
             (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
   // Also handle an i64 load because that may get selected as a faster way to
   // load the data.
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1307,6 +1308,11 @@
             (VMOVHPDrm VR128:$src1, addr:$src2)>;
 
   def : Pat<(store (f64 (extractelt
+                     (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+                     (iPTR 0))), addr:$dst),
+            (VMOVHPDmr addr:$dst, VR128:$src)>;
+
+  def : Pat<(store (f64 (extractelt
                      (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                      (iPTR 0))), addr:$dst),
             (VMOVHPDmr addr:$dst, VR128:$src)>;
@@ -1332,6 +1338,7 @@
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (scalar_to_vector (loadf64 addr:$src2)))),
             (MOVHPDrm VR128:$src1, addr:$src2)>;
+
   // Also handle an i64 load because that may get selected as a faster way to
   // load the data.
   def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1339,6 +1346,11 @@
             (MOVHPDrm VR128:$src1, addr:$src2)>;
 
   def : Pat<(store (f64 (extractelt
+                     (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+                     (iPTR 0))), addr:$dst),
+            (MOVHPDmr addr:$dst, VR128:$src)>;
+
+  def : Pat<(store (f64 (extractelt
                      (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                      (iPTR 0))), addr:$dst),
             (MOVHPDmr addr:$dst, VR128:$src)>;
Index: test/CodeGen/X86/buildvec-insertvec.ll
===================================================================
--- test/CodeGen/X86/buildvec-insertvec.ll
+++ test/CodeGen/X86/buildvec-insertvec.ll
@@ -26,8 +26,8 @@
 define <4 x float> @test_negative_zero_1(<4 x float> %A) {
 ; CHECK-LABEL: test_negative_zero_1:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movapd %xmm0, %xmm1
-; CHECK-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; CHECK-NEXT:    xorps %xmm2, %xmm2
 ; CHECK-NEXT:    blendps {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
Index: test/CodeGen/X86/haddsub-2.ll
===================================================================
--- test/CodeGen/X86/haddsub-2.ll
+++ test/CodeGen/X86/haddsub-2.ll
@@ -907,9 +907,9 @@
 define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: not_a_hsub_2:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
-; SSE-NEXT:    movapd %xmm0, %xmm3
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE-NEXT:    subss %xmm3, %xmm2
 ; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -917,7 +917,7 @@
 ; SSE-NEXT:    movaps %xmm1, %xmm3
 ; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
 ; SSE-NEXT:    movaps %xmm1, %xmm4
-; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-NEXT:    subss %xmm4, %xmm3
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
@@ -964,11 +964,11 @@
 define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
 ; SSE-LABEL: not_a_hsub_3:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movapd %xmm1, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT:    movaps %xmm1, %xmm2
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-NEXT:    subsd %xmm2, %xmm1
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-NEXT:    subsd %xmm0, %xmm2
 ; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSE-NEXT:    movapd %xmm2, %xmm0
Index: test/CodeGen/X86/haddsub-undef.ll
===================================================================
--- test/CodeGen/X86/haddsub-undef.ll
+++ test/CodeGen/X86/haddsub-undef.ll
@@ -102,8 +102,8 @@
 define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
 ; SSE-LABEL: test5_undef:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    addsd %xmm0, %xmm1
 ; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
@@ -168,7 +168,7 @@
 ; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE-NEXT:    addss %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
 ; SSE-NEXT:    addss %xmm2, %xmm0
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
Index: test/CodeGen/X86/nontemporal-2.ll
===================================================================
--- test/CodeGen/X86/nontemporal-2.ll
+++ test/CodeGen/X86/nontemporal-2.ll
@@ -563,7 +563,7 @@
 ;
 ; SSE4A-LABEL: test_extract_f64:
 ; SSE4A:       # BB#0:
-; SSE4A-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE4A-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
 ; SSE4A-NEXT:    retq
 ;
Index: test/CodeGen/X86/pr11334.ll
===================================================================
--- test/CodeGen/X86/pr11334.ll
+++ test/CodeGen/X86/pr11334.ll
@@ -21,13 +21,13 @@
 ; SSE-LABEL: v3f2d_ext_vec:
 ; SSE:       # BB#0: # %entry
 ; SSE-NEXT:    cvtps2pd %xmm0, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvtps2pd %xmm0, %xmm0
 ; SSE-NEXT:    movlps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movapd %xmm2, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movaps %xmm2, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    fldl -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movapd %xmm2, %xmm0
+; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: v3f2d_ext_vec:
@@ -43,7 +43,7 @@
 ; SSE-LABEL: v4f2d_ext_vec:
 ; SSE:       # BB#0: # %entry
 ; SSE-NEXT:    cvtps2pd %xmm0, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvtps2pd %xmm0, %xmm1
 ; SSE-NEXT:    movaps %xmm2, %xmm0
 ; SSE-NEXT:    retq
@@ -62,9 +62,9 @@
 ; SSE:       # BB#0: # %entry
 ; SSE-NEXT:    cvtps2pd %xmm0, %xmm5
 ; SSE-NEXT:    cvtps2pd %xmm1, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvtps2pd %xmm0, %xmm4
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    cvtps2pd %xmm1, %xmm3
 ; SSE-NEXT:    movaps %xmm5, %xmm0
 ; SSE-NEXT:    movaps %xmm4, %xmm1
Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3233,13 +3233,13 @@
 ; X32-LABEL: test_mm_storeh_sd:
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X32-NEXT:    movsd %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_storeh_sd:
 ; X64:       # BB#0:
-; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X64-NEXT:    movsd %xmm0, (%rdi)
 ; X64-NEXT:    retq
   %ext = extractelement <2 x double> %a1, i32 1
Index: test/CodeGen/X86/sse3-avx-addsub-2.ll
===================================================================
--- test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -267,8 +267,8 @@
 define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: test11:
 ; SSE:       # BB#0:
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    subss %xmm1, %xmm0
 ; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE-NEXT:    retq
@@ -339,8 +339,8 @@
 ; SSE:       # BB#0:
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    subss %xmm1, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    subss %xmm1, %xmm0
 ; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1,1,3]
@@ -408,9 +408,9 @@
 ; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    subss %xmm0, %xmm2
 ; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[1,0]
-; SSE-NEXT:    movapd %xmm1, %xmm4
-; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT:    movaps %xmm1, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
 ; SSE-NEXT:    subss %xmm4, %xmm3
 ; SSE-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE-NEXT:    addss %xmm0, %xmm4
Index: test/CodeGen/X86/sse_partial_update.ll
===================================================================
--- test/CodeGen/X86/sse_partial_update.ll
+++ test/CodeGen/X86/sse_partial_update.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
 ; rdar: 12558838
 
@@ -77,7 +77,7 @@
 ; CHECK:       ## BB#0: ## %entry
 ; CHECK-NEXT:    sqrtsd %xmm0, %xmm0
 ; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm2
-; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT:    cvtsd2ss %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm2, %xmm0
 ; CHECK-NEXT:    jmp _callee2 ## TAILCALL
Index: test/CodeGen/X86/vec_extract.ll
===================================================================
--- test/CodeGen/X86/vec_extract.ll
+++ test/CodeGen/X86/vec_extract.ll
@@ -33,7 +33,7 @@
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movaps (%eax), %xmm0
 ; X32-NEXT:    addps %xmm0, %xmm0
-; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X32-NEXT:    movss %xmm0, (%esp)
 ; X32-NEXT:    flds (%esp)
 ; X32-NEXT:    popl %eax
@@ -43,7 +43,7 @@
 ; X64:       # BB#0: # %entry
 ; X64-NEXT:    movaps (%rdi), %xmm0
 ; X64-NEXT:    addps %xmm0, %xmm0
-; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X64-NEXT:    retq
 entry:
   %tmp = load <4 x float>, <4 x float>* %F
@@ -78,7 +78,7 @@
 ; X32:       # BB#0: # %entry
 ; X32-NEXT:    subl $12, %esp
 ; X32-NEXT:    calll foo
-; X32-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X32-NEXT:    addsd {{[0-9]+}}(%esp), %xmm0
 ; X32-NEXT:    movsd %xmm0, (%esp)
 ; X32-NEXT:    fldl (%esp)
@@ -90,7 +90,7 @@
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
 ; X64-NEXT:    callq foo
-; X64-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X64-NEXT:    addsd (%rsp), %xmm0 # 8-byte Folded Reload
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    retq
Index: test/CodeGen/X86/vec_fp_to_int.ll
===================================================================
--- test/CodeGen/X86/vec_fp_to_int.ll
+++ test/CodeGen/X86/vec_fp_to_int.ll
@@ -15,7 +15,7 @@
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -40,7 +40,7 @@
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -67,7 +67,7 @@
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -95,13 +95,13 @@
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; SSE-NEXT:    cvttsd2si %xmm1, %rax
 ; SSE-NEXT:    movd %rax, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    cvttsd2si %xmm1, %rax
 ; SSE-NEXT:    movd %rax, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
@@ -135,14 +135,14 @@
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cvttsd2si %xmm1, %rax
 ; SSE-NEXT:    movd %rax, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    cvttsd2si %xmm1, %rax
 ; SSE-NEXT:    movd %rax, %xmm1
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttsd2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm0
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
@@ -176,8 +176,8 @@
 ; SSE-NEXT:    ucomisd %xmm2, %xmm0
 ; SSE-NEXT:    cmovaeq %rax, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    movapd %xmm0, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    subsd %xmm2, %xmm3
 ; SSE-NEXT:    cvttsd2si %xmm3, %rax
 ; SSE-NEXT:    xorq %rcx, %rax
@@ -227,8 +227,8 @@
 ; SSE-NEXT:    ucomisd %xmm1, %xmm0
 ; SSE-NEXT:    cmovaeq %rax, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    movapd %xmm0, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    subsd %xmm1, %xmm3
 ; SSE-NEXT:    cvttsd2si %xmm3, %rax
 ; SSE-NEXT:    xorq %rcx, %rax
@@ -280,8 +280,8 @@
 ; SSE-NEXT:    ucomisd %xmm1, %xmm0
 ; SSE-NEXT:    cmovaeq %rax, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    movapd %xmm0, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm3
 ; SSE-NEXT:    subsd %xmm1, %xmm3
 ; SSE-NEXT:    cvttsd2si %xmm3, %rax
 ; SSE-NEXT:    xorq %rcx, %rax
@@ -330,8 +330,8 @@
 ; SSE-NEXT:    ucomisd %xmm3, %xmm2
 ; SSE-NEXT:    cmovaeq %rcx, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm0
-; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
-; SSE-NEXT:    movapd %xmm2, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    movaps %xmm2, %xmm4
 ; SSE-NEXT:    subsd %xmm3, %xmm4
 ; SSE-NEXT:    cvttsd2si %xmm4, %rcx
 ; SSE-NEXT:    xorq %rax, %rcx
@@ -348,8 +348,8 @@
 ; SSE-NEXT:    ucomisd %xmm3, %xmm1
 ; SSE-NEXT:    cmovaeq %rcx, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
-; SSE-NEXT:    movapd %xmm1, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subsd %xmm3, %xmm4
 ; SSE-NEXT:    cvttsd2si %xmm4, %rcx
 ; SSE-NEXT:    xorq %rax, %rcx
@@ -417,8 +417,8 @@
 ; SSE-NEXT:    ucomisd %xmm2, %xmm1
 ; SSE-NEXT:    cmovaeq %rcx, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
-; SSE-NEXT:    movapd %xmm1, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    movaps %xmm1, %xmm4
 ; SSE-NEXT:    subsd %xmm2, %xmm4
 ; SSE-NEXT:    cvttsd2si %xmm4, %rcx
 ; SSE-NEXT:    xorq %rax, %rcx
@@ -436,8 +436,8 @@
 ; SSE-NEXT:    ucomisd %xmm2, %xmm0
 ; SSE-NEXT:    cmovaeq %rcx, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    movapd %xmm0, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    subsd %xmm2, %xmm4
 ; SSE-NEXT:    cvttsd2si %xmm4, %rcx
 ; SSE-NEXT:    xorq %rax, %rcx
@@ -568,7 +568,7 @@
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE-NEXT:    cvttss2si %xmm1, %rax
 ; SSE-NEXT:    movd %rax, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttss2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm1
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
@@ -611,7 +611,7 @@
 ; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
 ; SSE-NEXT:    cvttss2si %xmm1, %rax
 ; SSE-NEXT:    movd %rax, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttss2si %xmm0, %rax
 ; SSE-NEXT:    movd %rax, %xmm1
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
@@ -658,7 +658,7 @@
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE-NEXT:    cvttss2si %xmm0, %rax
 ; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT:    cvttss2si %xmm0, %rax
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -802,7 +802,7 @@
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
 ; SSE-NEXT:    cvttss2si %xmm2, %rax
 ; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-NEXT:    cvttss2si %xmm2, %rax
 ; SSE-NEXT:    movd %eax, %xmm2
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -818,7 +818,7 @@
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE-NEXT:    cvttss2si %xmm1, %rax
 ; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    cvttss2si %xmm1, %rax
 ; SSE-NEXT:    movd %eax, %xmm1
 ; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -891,8 +891,8 @@
 ; SSE-NEXT:    ucomiss %xmm1, %xmm3
 ; SSE-NEXT:    cmovaeq %rcx, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    movapd %xmm0, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    subss %xmm1, %xmm4
 ; SSE-NEXT:    cvttss2si %xmm4, %rcx
 ; SSE-NEXT:    xorq %rax, %rcx
@@ -982,8 +982,8 @@
 ; SSE-NEXT:    ucomiss %xmm1, %xmm3
 ; SSE-NEXT:    cmovaeq %rcx, %rdx
 ; SSE-NEXT:    movd %rdx, %xmm3
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT:    movapd %xmm0, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm4
 ; SSE-NEXT:    subss %xmm1, %xmm4
 ; SSE-NEXT:    cvttss2si %xmm4, %rcx
 ; SSE-NEXT:    xorq %rax, %rcx
Index: test/CodeGen/X86/vector-rem.ll
===================================================================
--- test/CodeGen/X86/vector-rem.ll
+++ test/CodeGen/X86/vector-rem.ll
@@ -99,10 +99,10 @@
 ; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    callq fmodf
 ; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; CHECK-NEXT:    callq fmodf
 ; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
Index: test/CodeGen/X86/widen_conv-3.ll
===================================================================
--- test/CodeGen/X86/widen_conv-3.ll
+++ test/CodeGen/X86/widen_conv-3.ll
@@ -74,7 +74,7 @@
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
-; X86-SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; X86-SSE2-NEXT:    movss %xmm1, 8(%eax)
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
@@ -123,7 +123,7 @@
 ; X64-SSE2-NEXT:    psrad $24, %xmm0
 ; X64-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE2-NEXT:    movlps %xmm0, (%rdi)
-; X64-SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X64-SSE2-NEXT:    movss %xmm0, 8(%rdi)
 ; X64-SSE2-NEXT:    retq
 ;
Index: test/CodeGen/X86/widen_conv-4.ll
===================================================================
--- test/CodeGen/X86/widen_conv-4.ll
+++ test/CodeGen/X86/widen_conv-4.ll
@@ -19,7 +19,7 @@
 ; X86-SSE2-NEXT:    movups %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movss %xmm2, 16(%eax)
 ; X86-SSE2-NEXT:    movaps %xmm2, %xmm0
-; X86-SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X86-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X86-SSE2-NEXT:    movss %xmm0, 24(%eax)
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
 ; X86-SSE2-NEXT:    movss %xmm2, 20(%eax)
@@ -49,7 +49,7 @@
 ; X64-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE2-NEXT:    movlps %xmm0, 16(%rdi)
 ; X64-SSE2-NEXT:    movups %xmm2, (%rdi)
-; X64-SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X64-SSE2-NEXT:    movss %xmm0, 24(%rdi)
 ; X64-SSE2-NEXT:    retq
 ;
@@ -100,7 +100,7 @@
 ; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X86-SSE2-NEXT:    movss %xmm0, (%eax)
 ; X86-SSE2-NEXT:    movaps %xmm0, %xmm1
-; X86-SSE2-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; X86-SSE2-NEXT:    movss %xmm1, 8(%eax)
 ; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
@@ -148,7 +148,7 @@
 ; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; X64-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE2-NEXT:    movlps %xmm0, (%rdi)
-; X64-SSE2-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; X64-SSE2-NEXT:    movss %xmm0, 8(%rdi)
 ; X64-SSE2-NEXT:    retq
 ;
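
Note (not part of the patch): a minimal IR sketch of the extract-high-lane shape these updated tests exercise, mirroring the existing test_mm_storeh_sd case in sse2-intrinsics-fast-isel.ll. With this change the plain-SSE lowering of the <1,u> v2f64 shuffle goes through movhlps rather than shufpd, as the revised CHECK lines above show; the function name below is illustrative only.

; Illustrative sketch only -- same extract-high-element-and-store shape as
; test_mm_storeh_sd above; not part of the patch.
define void @store_high_f64(<2 x double> %v, double* %p) {
  %hi = extractelement <2 x double> %v, i32 1
  store double %hi, double* %p
  ret void
}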