Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -28627,10 +28627,11 @@
   EVT VT = N->getValueType(0);
   EVT SrcVT = N0.getValueType();
 
-  // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-  // special and don't usually play with other vector types, it's better to
-  // handle them early to be sure we emit efficient code by avoiding
-  // store-load conversions.
+  // Since MMX types are special and don't usually play with other vector types,
+  // it's better to handle them early to be sure we emit efficient code by
+  // avoiding store-load conversions.
+
+  // Detect bitcasts between i32 to x86mmx low word.
   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
       SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
     SDValue N00 = N0->getOperand(0);
@@ -28638,6 +28639,14 @@
       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
   }
 
+  // Detect bitcasts between v2i64/v2f64 extraction to x86mmx.
+  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      isNullConstant(N0.getOperand(1))) {
+    SDValue N00 = N0->getOperand(0);
+    if (N00.getValueType().is128BitVector())
+      return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, N00);
+  }
+
   // Convert a bitcasted integer logic operation that has one bitcasted
   // floating-point operand into a floating-point logic operation. This may
   // create a load of a constant, but that is cheaper than materializing the
Index: lib/Target/X86/X86InstrMMX.td
===================================================================
--- lib/Target/X86/X86InstrMMX.td
+++ lib/Target/X86/X86InstrMMX.td
@@ -670,6 +670,14 @@
            (MMX_MOVQ2FR64rr VR64:$src)>;
 def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
           (MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (int_x86_sse2_cvtps2dq VR128:$src))))),
+          (MMX_CVTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
+          (MMX_CVTTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+          (MMX_CVTPD2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+          (MMX_CVTTPD2PIirr VR128:$src)>;
 }
Index: test/CodeGen/X86/mmx-cvt.ll
===================================================================
--- test/CodeGen/X86/mmx-cvt.ll
+++ test/CodeGen/X86/mmx-cvt.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
 
-; FIXME: If we are transferring XMM conversion results to MMX registers we could use the MMX equivalents
+; If we are transferring XMM conversion results to MMX registers we could use the MMX equivalents
 ; (CVTPD2PI/CVTTPD2PI + CVTPS2PI/CVTTPS2PI) without affecting rounding/exceptions etc.
 
 define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
@@ -11,13 +11,9 @@
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    cvtpd2dq %xmm0, %xmm0
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; X86-NEXT:    cvtpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %ecx
@@ -30,8 +26,7 @@
 ;
 ; X64-LABEL: cvt_v2f64_v2i32:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtpd2dq %xmm0, %xmm0
-; X64-NEXT:    movdq2q %xmm0, %mm0
+; X64-NEXT:    cvtpd2pi %xmm0, %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
@@ -52,13 +47,9 @@
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    cvttpd2dq %xmm0, %xmm0
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; X86-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %ecx
@@ -71,8 +62,7 @@
 ;
 ; X64-LABEL: cvtt_v2f64_v2i32:
 ; X64:       # BB#0:
-; X64-NEXT:    cvttpd2dq %xmm0, %xmm0
-; X64-NEXT:    movdq2q %xmm0, %mm0
+; X64-NEXT:    cvttpd2pi %xmm0, %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
@@ -131,13 +121,9 @@
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    cvtps2dq %xmm0, %xmm0
-; X86-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; X86-NEXT:    cvtps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %ecx
@@ -150,8 +136,7 @@
 ;
 ; X64-LABEL: cvt_v2f32_v2i32:
 ; X64:       # BB#0:
-; X64-NEXT:    cvtps2dq %xmm0, %xmm0
-; X64-NEXT:    movdq2q %xmm0, %mm0
+; X64-NEXT:    cvtps2pi %xmm0, %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
@@ -172,13 +157,9 @@
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    cvttps2dq %xmm0, %xmm0
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %ecx
@@ -191,8 +172,7 @@
 ;
 ; X64-LABEL: cvtt_v2f32_v2i32:
 ; X64:       # BB#0:
-; X64-NEXT:    cvttps2dq %xmm0, %xmm0
-; X64-NEXT:    movdq2q %xmm0, %mm0
+; X64-NEXT:    cvttps2pi %xmm0, %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
@@ -213,13 +193,9 @@
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    andl $-8, %esp
-; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    subl $8, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    cvttps2dq %xmm0, %xmm0
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm0
+; X86-NEXT:    cvttps2pi %xmm0, %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
 ; X86-NEXT:    movq %mm0, (%esp)
 ; X86-NEXT:    movl (%esp), %ecx
@@ -232,8 +208,7 @@
 ;
 ; X64-LABEL: fptosi_v2f32_v2i32:
 ; X64:       # BB#0:
-; X64-NEXT:    cvttps2dq %xmm0, %xmm0
-; X64-NEXT:    movdq2q %xmm0, %mm0
+; X64-NEXT:    cvttps2pi %xmm0, %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    retq
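
For reference, below is a minimal LLVM IR sketch (not part of the patch; the function name and exact body are illustrative) of the shape the new EXTRACT_VECTOR_ELT bitcast combine and the MMX_X86movdq2q selection patterns are aimed at: an SSE2 conversion whose result is moved into an MMX register through a 64-bit element extraction and bitcast. With the changes above, this kind of input should select to a single cvtpd2pi instead of cvtpd2dq followed by a stack round trip (X86) or movdq2q (X64).

; Illustrative only: the low 64 bits of an SSE2 conversion result are moved
; into an MMX register via element 0 of the v2i64 view of the vector.
define x86_mmx @cvtpd2pi_sketch(<2 x double> %a) nounwind {
  %cvt = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a)
  %bc = bitcast <4 x i32> %cvt to <2 x i64>
  %lo = extractelement <2 x i64> %bc, i32 0
  %mmx = bitcast i64 %lo to x86_mmx
  ret x86_mmx %mmx
}
declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)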