diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37498,9 +37498,16 @@
   }
   if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
       AllowFloatDomain) {
-    V2 = V1;
-    Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
-    SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
+    if (SM_SentinelUndef == Mask[1]) {
+      V2 = V1;
+      V1 = DAG.getUNDEF(MVT::v4f32);
+      Shuffle = X86ISD::MOVHLPS;
+      SrcVT = DstVT = MVT::v4f32;
+    } else {
+      V2 = V1;
+      Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
+      SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
+    }
     return true;
   }
   if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
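The hunk above is the core of the patch: when the second element of a {1, 1} shuffle mask is really undef (SM_SentinelUndef), the matcher now emits MOVHLPS on v4f32 with an undef first source instead of the SSE2 UNPCKH form, presumably so that only the one live input needs a register. A minimal IR reproducer for that path (hypothetical; it is not one of the test files updated below):

define <2 x double> @movhlps_undef_hi(<2 x double> %x) {
  ; Result lane 1 is undef, so the target shuffle mask is
  ; {1, SM_SentinelUndef} and the new SM_SentinelUndef == Mask[1]
  ; branch should fire, selecting a single movhlps with an undef
  ; first source.
  %s = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  ret <2 x double> %s
}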
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5419,7 +5419,6 @@
   case X86::MMX_PUNPCKLBWrr:
   case X86::MMX_PUNPCKLWDrr:
   case X86::MMX_PUNPCKLDQrr:
-  case X86::MOVHLPSrr:
   case X86::PACKSSWBrr:
   case X86::PACKUSWBrr:
   case X86::PACKSSDWrr:
@@ -5441,6 +5440,12 @@
     // VEX counterparts.
     return OpNum == 2 && !ForLoadFold;

+  // MOVHLPSrr is used as a fake move of the upper 64 bits of the second
+  // source into the lower 64 bits of the destination. So the first source
+  // is often set to undef.
+  case X86::MOVHLPSrr:
+    return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
+
   case X86::VMOVLHPSrr:
   case X86::VMOVLHPSZrr:
   case X86::VPACKSSWBrr:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -785,7 +785,7 @@
                    (MOVHPDrm VR128:$src1, addr:$src2)>;

 def : Pat<(store (f64 (extractelt
-                        (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
+                        (bc_v2f64 (v4f32 (X86Movhlps undef, VR128:$src))),
                        (iPTR 0))), addr:$dst),
           (MOVHPDmr addr:$dst, VR128:$src)>;
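Two supporting changes: the X86InstrInfo.cpp hunk teaches the undef-register bookkeeping that either source of MOVHLPSrr may be undef (matching the added comment), and the X86InstrSSE.td hunk re-points the MOVHPD store-fold pattern at the new canonical form, X86Movhlps with an undef first operand, instead of the old X86Shufp form. A sketch of IR that should keep folding into a single movhpd store through that pattern, assuming the DAG combine first canonicalizes the high-half extract into the movhlps form (hypothetical reproducer, not part of the updated tests):

define void @store_hi_half(<4 x float> %v, double* %p) {
  ; The upper 64 bits of %v are reinterpreted as an f64 and stored;
  ; with the pattern above this should select a movhpd store rather
  ; than a shuffle followed by a low-half store.
  %d = bitcast <4 x float> %v to <2 x double>
  %hi = extractelement <2 x double> %d, i32 1
  store double %hi, double* %p
  ret void
}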
diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll
--- a/llvm/test/CodeGen/X86/cast-vsel.ll
+++ b/llvm/test/CodeGen/X86/cast-vsel.ll
@@ -141,8 +141,9 @@
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm5
 ; SSE41-NEXT:    cvtps2pd %xmm5, %xmm0
-; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
-; SSE41-NEXT:    cvtps2pd %xmm5, %xmm1
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1]
+; SSE41-NEXT:    cvtps2pd %xmm1, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: fpext:
diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll
--- a/llvm/test/CodeGen/X86/combine-fcopysign.ll
+++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll
@@ -194,16 +194,16 @@
 define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float> %y) {
 ; SSE-LABEL: combine_vec_fcopysign_fpext_sgn:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    cvtps2pd %xmm2, %xmm3
-; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm2[1],xmm3[1]
+; SSE-NEXT:    cvtps2pd %xmm3, %xmm3
 ; SSE-NEXT:    cvtps2pd %xmm2, %xmm2
 ; SSE-NEXT:    movaps {{.*#+}} xmm4 = [NaN,NaN]
 ; SSE-NEXT:    andps %xmm4, %xmm0
 ; SSE-NEXT:    movaps %xmm4, %xmm5
-; SSE-NEXT:    andnps %xmm3, %xmm5
+; SSE-NEXT:    andnps %xmm2, %xmm5
 ; SSE-NEXT:    orps %xmm5, %xmm0
 ; SSE-NEXT:    andps %xmm4, %xmm1
-; SSE-NEXT:    andnps %xmm2, %xmm4
+; SSE-NEXT:    andnps %xmm3, %xmm4
 ; SSE-NEXT:    orps %xmm4, %xmm1
 ; SSE-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/complex-fastmath.ll b/llvm/test/CodeGen/X86/complex-fastmath.ll
--- a/llvm/test/CodeGen/X86/complex-fastmath.ll
+++ b/llvm/test/CodeGen/X86/complex-fastmath.ll
@@ -57,9 +57,8 @@
 define <2 x double> @complex_square_f64(<2 x double>) #0 {
 ; SSE-LABEL: complex_square_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT:    movapd %xmm0, %xmm2
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; SSE-NEXT:    movaps %xmm0, %xmm2
 ; SSE-NEXT:    addsd %xmm0, %xmm2
 ; SSE-NEXT:    mulsd %xmm1, %xmm2
 ; SSE-NEXT:    mulsd %xmm0, %xmm0
@@ -160,11 +159,9 @@
 define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
 ; SSE-LABEL: complex_mul_f64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT:    movapd %xmm1, %xmm3
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
-; SSE-NEXT:    movapd %xmm3, %xmm4
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
+; SSE-NEXT:    movaps %xmm3, %xmm4
 ; SSE-NEXT:    mulsd %xmm0, %xmm4
 ; SSE-NEXT:    mulsd %xmm1, %xmm0
 ; SSE-NEXT:    mulsd %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -188,15 +188,14 @@
 ; X32-SSE2:       # %bb.0: # %entry
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE2-NEXT:    movaps 16(%ecx), %xmm0
-; X32-SSE2-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X32-SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; X32-SSE2-NEXT:    xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT:    cmpltss %xmm0, %xmm1
 ; X32-SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; X32-SSE2-NEXT:    andps %xmm1, %xmm2
 ; X32-SSE2-NEXT:    andnps %xmm0, %xmm1
-; X32-SSE2-NEXT:    orps %xmm2, %xmm1
-; X32-SSE2-NEXT:    movss %xmm1, (%eax)
+; X32-SSE2-NEXT:    orps %xmm1, %xmm2
+; X32-SSE2-NEXT:    movss %xmm2, (%eax)
 ; X32-SSE2-NEXT:    retl
 ;
 ; X64-SSSE3-LABEL: PR43971:
diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll
--- a/llvm/test/CodeGen/X86/fma.ll
+++ b/llvm/test/CodeGen/X86/fma.ll
@@ -310,52 +310,53 @@
 ; FMACALL64:       ## %bb.0: ## %entry
 ; FMACALL64-NEXT:    subq $88, %rsp ## encoding: [0x48,0x83,0xec,0x58]
 ; FMACALL64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x20]
 ; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x10]
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x20]
-; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
-; FMACALL64-NEXT:    shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3,3,3]
-; FMACALL64-NEXT:    shufps $255, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xff]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
-; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
 ; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
 ; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x30]
 ; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x14,0x04,0x24]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
+; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x30]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
+; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x20]
+; FMACALL64-NEXT:    shufps $255, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xff]
+; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
+; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x20]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x40]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
 ; FMACALL64-NEXT:    shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
@@ -363,7 +364,7 @@
 ; FMACALL64-NEXT:    shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x20]
 ; FMACALL64-NEXT:    shufps $85, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0x55]
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1,1,1]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
@@ -372,8 +373,8 @@
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
 ; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FMACALL64-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x0c,0x24]
+; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x4c,0x24,0x30]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; FMACALL64-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
 ; FMACALL64-NEXT:    addq $88, %rsp ## encoding: [0x48,0x83,0xc4,0x58]
@@ -482,136 +483,135 @@
 ; FMACALL64:       ## %bb.0: ## %entry
 ; FMACALL64-NEXT:    subq $136, %rsp ## encoding: [0x48,0x81,0xec,0x88,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x6c,0x24,0x50]
-; FMACALL64-NEXT:    movaps %xmm4, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x24,0x24]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x6c,0x24,0x40]
+; FMACALL64-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x64,0x24,0x10]
 ; FMACALL64-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x5c,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x5c,0x24,0x30]
 ; FMACALL64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x60]
 ; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x20]
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
-; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
-; FMACALL64-NEXT:    movaps %xmm2, %xmm1 ## encoding: [0x0f,0x28,0xca]
-; FMACALL64-NEXT:    shufps $255, %xmm2, %xmm1 ## encoding: [0x0f,0xc6,0xca,0xff]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3],xmm2[3,3]
-; FMACALL64-NEXT:    movaps %xmm4, %xmm2 ## encoding: [0x0f,0x28,0xd4]
-; FMACALL64-NEXT:    shufps $255, %xmm4, %xmm2 ## encoding: [0x0f,0xc6,0xd4,0xff]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3],xmm4[3,3]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
+; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
+; FMACALL64-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
+; FMACALL64-NEXT:    movhlps %xmm2, %xmm1 ## encoding: [0x0f,0x12,0xca]
+; FMACALL64-NEXT:    ## xmm1 = xmm2[1],xmm1[1]
+; FMACALL64-NEXT:    xorps %xmm2, %xmm2 ## encoding: [0x0f,0x57,0xd2]
+; FMACALL64-NEXT:    movhlps %xmm4, %xmm2 ## encoding: [0x0f,0x12,0xd4]
+; FMACALL64-NEXT:    ## xmm2 = xmm4[1],xmm2[1]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x20]
+; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x10]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
+; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x60]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x14,0x24]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
+; FMACALL64-NEXT:    shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3,3,3]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x10]
+; FMACALL64-NEXT:    shufps $255, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xff]
+; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x14,0x44,0x24,0x20]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x70]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FMACALL64-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x0c,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x60]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x14,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x10]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x20]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x70]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x60]
 ; FMACALL64-NEXT:    shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1,1,1]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x14,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x10]
 ; FMACALL64-NEXT:    shufps $85, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0x55]
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1,1,1]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x20]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x70]
 ; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x4c,0x24,0x70]
+; FMACALL64-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x0c,0x24]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x20]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x70]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x44,0x24,0x28]
+; FMACALL64-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x4c,0x24,0x38]
+; FMACALL64-NEXT:    ## xmm1 = mem[0,1],xmm1[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x54,0x24,0x48]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
+; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
 ; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
 ; FMACALL64-NEXT:    shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x50]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x40]
 ; FMACALL64-NEXT:    shufps $255, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xff]
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x50]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
-; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x14,0x04,0x24]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FMACALL64-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x0c,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x50]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x40]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
 ; FMACALL64-NEXT:    shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
 ; FMACALL64-NEXT:    shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x50]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x40]
 ; FMACALL64-NEXT:    shufps $85, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0x55]
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1,1,1]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
@@ -624,7 +624,7 @@
 ; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x0c,0x24]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x70]
 ; FMACALL64-NEXT:    addq $136, %rsp ## encoding: [0x48,0x81,0xc4,0x88,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    retq ## encoding: [0xc3]
 ;
@@ -830,66 +830,66 @@
 ; FMACALL64:       ## %bb.0: ## %entry
 ; FMACALL64-NEXT:    subq $168, %rsp ## encoding: [0x48,0x81,0xec,0xa8,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0xbc,0x24,0x80,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x7c,0x24,0x70]
 ; FMACALL64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x74,0x24,0x20]
-; FMACALL64-NEXT:    movaps %xmm5, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x2c,0x24]
+; FMACALL64-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x6c,0x24,0x10]
 ; FMACALL64-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x64,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x64,0x24,0x30]
 ; FMACALL64-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x5c,0x24,0x70]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x5c,0x24,0x60]
 ; FMACALL64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x40]
-; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x50]
+; FMACALL64-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x0c,0x24]
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
-; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
-; FMACALL64-NEXT:    movaps %xmm4, %xmm1 ## encoding: [0x0f,0x28,0xcc]
-; FMACALL64-NEXT:    shufps $255, %xmm4, %xmm1 ## encoding: [0x0f,0xc6,0xcc,0xff]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3],xmm4[3,3]
-; FMACALL64-NEXT:    shufps $255, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xff]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
+; FMACALL64-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
+; FMACALL64-NEXT:    movhlps %xmm4, %xmm1 ## encoding: [0x0f,0x12,0xcc]
+; FMACALL64-NEXT:    ## xmm1 = xmm4[1],xmm1[1]
+; FMACALL64-NEXT:    movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xb8,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x60]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x40]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
+; FMACALL64-NEXT:    shufps $255, %xmm2, %xmm2 ## encoding: [0x0f,0xc6,0xd2,0xff]
+; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x14,0x44,0x24,0x60]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x84,0x24,0x90,0x00,0x00,0x00]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x40]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x60]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x84,0x24,0x90,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
 ; FMACALL64-NEXT:    shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
@@ -898,20 +898,32 @@
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x60]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x8c,0x24,0x90,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x8c,0x24,0x90,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x4c,0x24,0x40]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x60]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x8c,0x24,0x90,0x00,0x00,0x00]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x44,0x24,0x08]
+; FMACALL64-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x4c,0x24,0x18]
+; FMACALL64-NEXT:    ## xmm1 = mem[0,1],xmm1[2,3]
+; FMACALL64-NEXT:    movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xc8,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
+; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x30]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
 ; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
 ; FMACALL64-NEXT:    shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
@@ -919,41 +931,27 @@
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
-; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
-; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x14,0x44,0x24,0x10]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x40]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
 ; FMACALL64-NEXT:    shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1,1,1]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
 ; FMACALL64-NEXT:    shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00]
@@ -962,16 +960,28 @@
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
 ; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x4c,0x24,0x50]
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x4c,0x24,0x30]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x40]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x44,0x24,0x58]
+; FMACALL64-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x4c,0x24,0x28]
+; FMACALL64-NEXT:    ## xmm1 = mem[0,1],xmm1[2,3]
+; FMACALL64-NEXT:    movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xd8,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
+; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
@@ -983,37 +993,23 @@
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FMACALL64-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x0c,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x40]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x20]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x14,0x04,0x24]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x30]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x40]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x20]
-; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
@@ -1025,21 +1021,33 @@
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1,1,1]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
 ; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x0c,0x24]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
-; FMACALL64-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x0c,0x24]
+; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x44,0x24,0x68]
+; FMACALL64-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x4c,0x24,0x78]
+; FMACALL64-NEXT:    ## xmm1 = mem[0,1],xmm1[2,3]
+; FMACALL64-NEXT:    movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xe8,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
+; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x20]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x70]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x60]
 ; FMACALL64-NEXT:    shufps $255, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0xff]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x70]
 ; FMACALL64-NEXT:    shufps $255, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0xff]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[3,3,3,3]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00]
@@ -1047,41 +1055,27 @@
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[3,3,3,3]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x20]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x70]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
-; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
-; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
-; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x14,0x44,0x24,0x20]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x20]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x20]
+; FMACALL64-NEXT:    unpcklps %xmm0, %xmm1 ## encoding: [0x0f,0x14,0xc8]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x20]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x70]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x60]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x70]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x70]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x60]
 ; FMACALL64-NEXT:    shufps $85, %xmm0, %xmm0 ## encoding: [0x0f,0xc6,0xc0,0x55]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x8c,0x24,0x80,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x70]
 ; FMACALL64-NEXT:    shufps $85, %xmm1, %xmm1 ## encoding: [0x0f,0xc6,0xc9,0x55]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1,1,1]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xe0,0x00,0x00,0x00]
@@ -1090,18 +1084,18 @@
 ; FMACALL64-NEXT:    callq _fmaf ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x5c,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x5c,0x24,0x50]
 ; FMACALL64-NEXT:    unpcklps %xmm0, %xmm3 ## encoding: [0x0f,0x14,0xd8]
 ; FMACALL64-NEXT:    ## xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
 ; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
 ; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x5c,0x24,0x20]
 ; FMACALL64-NEXT:    ## xmm3 = xmm3[0],mem[0]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x60]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x84,0x24,0x90,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x14,0x24]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x10]
 ; FMACALL64-NEXT:    addq $168, %rsp ## encoding: [0x48,0x81,0xc4,0xa8,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    retq ## encoding: [0xc3]
 ;
@@ -1441,34 +1435,32 @@
 ; FMACALL64:       ## %bb.0:
 ; FMACALL64-NEXT:    subq $72, %rsp ## encoding: [0x48,0x83,0xec,0x48]
 ; FMACALL64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x20]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x10]
 ; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x10]
-; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
-; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x20]
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x30]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
 ; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
 ; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
 ; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x20]
 ; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
 ; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
 ; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x30]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
-; FMACALL64-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
-; FMACALL64-NEXT:    movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x20]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x10]
+; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x04,0x24]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0]
 ; FMACALL64-NEXT:    addq $72, %rsp ## encoding: [0x48,0x83,0xc4,0x48]
 ; FMACALL64-NEXT:    retq ## encoding: [0xc3]
 ;
@@ -1580,70 +1572,66 @@
 ; FMACALL64-NEXT:    subq $120, %rsp ## encoding: [0x48,0x83,0xec,0x78]
 ; FMACALL64-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x6c,0x24,0x40]
-; FMACALL64-NEXT:    movaps %xmm4, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x24,0x24]
+; FMACALL64-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x64,0x24,0x60]
 ; FMACALL64-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x5c,0x24,0x30]
-; FMACALL64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x60]
+; FMACALL64-NEXT:    movaps %xmm2, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x14,0x24]
 ; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x20]
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
-; FMACALL64-NEXT:    movaps %xmm2, %xmm1 ## encoding: [0x0f,0x28,0xca]
-; FMACALL64-NEXT:    movaps %xmm4, %xmm2 ## encoding: [0x0f,0x28,0xd4]
-; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
-; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
 ; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x60]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x14,0x24]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
+; FMACALL64-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
+; FMACALL64-NEXT:    movhlps %xmm2, %xmm1 ## encoding: [0x0f,0x12,0xca]
+; FMACALL64-NEXT:    ## xmm1 = xmm2[1],xmm1[1]
+; FMACALL64-NEXT:    xorps %xmm2, %xmm2 ## encoding: [0x0f,0x57,0xd2]
+; FMACALL64-NEXT:    movhlps %xmm4, %xmm2 ## encoding: [0x0f,0x12,0xd4]
+; FMACALL64-NEXT:    ## xmm2 = xmm4[1],xmm2[1]
 ; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
-; FMACALL64-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
-; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x10]
+; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x10]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x60]
+; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
+; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x44,0x24,0x50]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x44,0x24,0x28]
+; FMACALL64-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x4c,0x24,0x38]
+; FMACALL64-NEXT:    ## xmm1 = mem[0,1],xmm1[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x54,0x24,0x48]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
 ; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x04,0x24]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x20]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x54,0x24,0x40]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
 ; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
-; FMACALL64-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
+; FMACALL64-NEXT:    movaps %xmm0, %xmm1 ## encoding: [0x0f,0x28,0xc8]
+; FMACALL64-NEXT:    unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x0c,0x24]
+; FMACALL64-NEXT:    ## xmm1 = xmm1[0],mem[0]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x10]
 ; FMACALL64-NEXT:    addq $120, %rsp ## encoding: [0x48,0x83,0xc4,0x78]
@@ -1778,132 +1766,119 @@
 ; FMACALL64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x7c,0x24,0x70]
 ; FMACALL64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x74,0x24,0x20]
-; FMACALL64-NEXT:    movaps %xmm5, (%rsp) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x2c,0x24]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x74,0x24,0x30]
+; FMACALL64-NEXT:    movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x6c,0x24,0x10]
 ; FMACALL64-NEXT:    movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x64,0x24,0x10]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x64,0x24,0x40]
 ; FMACALL64-NEXT:    movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x5c,0x24,0x60]
 ; FMACALL64-NEXT:    movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x50]
-; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x40]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x54,0x24,0x20]
+; FMACALL64-NEXT:    movaps %xmm1, (%rsp) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x0c,0x24]
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movaps %xmm4, %xmm1 ## encoding: [0x0f,0x28,0xcc]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
+; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
+; FMACALL64-NEXT:    xorps %xmm1, %xmm1 ## encoding: [0x0f,0x57,0xc9]
+; FMACALL64-NEXT:    movhlps %xmm4, %xmm1 ## encoding: [0x0f,0x12,0xcc]
+; FMACALL64-NEXT:    ## xmm1 = xmm4[1],xmm1[1]
+; FMACALL64-NEXT:    movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xa8,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
 ; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x30]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x84,0x24,0x80,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x50]
 ; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x40]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = xmm2[1,1]
 ; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x30]
-; FMACALL64-NEXT:    movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[0],xmm0[0]
-; FMACALL64-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x4c,0x24,0x30]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x40]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
-; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
+; FMACALL64-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x66,0x0f,0x14,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## xmm0 = xmm0[0],mem[0]
+; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x50]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x44,0x24,0x08]
+; FMACALL64-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
+; FMACALL64-NEXT:    movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x12,0x4c,0x24,0x18]
+; FMACALL64-NEXT:    ## xmm1 = mem[0,1],xmm1[2,3]
+; FMACALL64-NEXT:    movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xb8,0x00,0x00,0x00]
+; FMACALL64-NEXT:    ## xmm2 = mem[0,1],xmm2[2,3]
 ; FMACALL64-NEXT:    callq _fma ## encoding: [0xe8,A,A,A,A]
 ; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel
 ; FMACALL64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x10]
-; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x44,0x24,0x40]
-; FMACALL64-NEXT:    movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0]
-; FMACALL64-NEXT:    ## xmm0 = xmm0[1,1]
-; FMACALL64-NEXT:    movaps (%rsp), %xmm1 ## 16-byte Reload
-; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x0c,0x24]
-; FMACALL64-NEXT:    movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9]
-; FMACALL64-NEXT:    ## xmm1 = xmm1[1,1]
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x29,0x44,0x24,0x40]
+; FMACALL64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x04,0x24]
+; FMACALL64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; FMACALL64-NEXT:    ## encoding: [0x0f,0x28,0x4c,0x24,0x10]
 ; FMACALL64-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xb0,0x00,0x00,0x00]
-; FMACALL64-NEXT:    movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2]
-; FMACALL64-NEXT:    ## xmm2 = 
xmm2[1,1] ; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] -; FMACALL64-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8] -; FMACALL64-NEXT: ## xmm1 = xmm1[0],xmm0[0] -; FMACALL64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x4c,0x24,0x10] -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] -; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] +; FMACALL64-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x14,0x44,0x24,0x40] +; FMACALL64-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x10] +; FMACALL64-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x12,0x44,0x24,0x28] +; FMACALL64-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; FMACALL64-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x12,0x4c,0x24,0x38] +; FMACALL64-NEXT: ## xmm1 = mem[0,1],xmm1[2,3] +; FMACALL64-NEXT: movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xc8,0x00,0x00,0x00] +; FMACALL64-NEXT: ## xmm2 = mem[0,1],xmm2[2,3] ; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel ; FMACALL64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x04,0x24] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] -; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0] -; FMACALL64-NEXT: ## xmm0 = xmm0[1,1] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x20] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x20] -; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9] -; FMACALL64-NEXT: ## xmm1 = xmm1[1,1] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x30] ; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] -; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2] -; FMACALL64-NEXT: ## xmm2 = xmm2[1,1] ; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel -; FMACALL64-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x0c,0x24] -; FMACALL64-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8] -; FMACALL64-NEXT: ## xmm1 = xmm1[0],xmm0[0] -; FMACALL64-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill -; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x0c,0x24] -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x60] -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; 
FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x70] -; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] +; FMACALL64-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x14,0x04,0x24] +; FMACALL64-NEXT: ## xmm0 = xmm0[0],mem[0] +; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x30] +; FMACALL64-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x12,0x44,0x24,0x68] +; FMACALL64-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; FMACALL64-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x12,0x4c,0x24,0x78] +; FMACALL64-NEXT: ## xmm1 = mem[0,1],xmm1[2,3] +; FMACALL64-NEXT: movlps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x12,0x94,0x24,0xd8,0x00,0x00,0x00] +; FMACALL64-NEXT: ## xmm2 = mem[0,1],xmm2[2,3] ; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel ; FMACALL64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; FMACALL64-NEXT: ## encoding: [0x0f,0x29,0x44,0x24,0x20] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x60] -; FMACALL64-NEXT: movhlps %xmm0, %xmm0 ## encoding: [0x0f,0x12,0xc0] -; FMACALL64-NEXT: ## xmm0 = xmm0[1,1] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x70] -; FMACALL64-NEXT: movhlps %xmm1, %xmm1 ## encoding: [0x0f,0x12,0xc9] -; FMACALL64-NEXT: ## xmm1 = xmm1[1,1] ; FMACALL64-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 ## encoding: [0x0f,0x28,0x94,0x24,0xd0,0x00,0x00,0x00] -; FMACALL64-NEXT: movhlps %xmm2, %xmm2 ## encoding: [0x0f,0x12,0xd2] -; FMACALL64-NEXT: ## xmm2 = xmm2[1,1] ; FMACALL64-NEXT: callq _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: reloc_branch_4byte_pcrel -; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x5c,0x24,0x20] -; FMACALL64-NEXT: movlhps %xmm0, %xmm3 ## encoding: [0x0f,0x16,0xd8] -; FMACALL64-NEXT: ## xmm3 = xmm3[0],xmm0[0] +; FMACALL64-NEXT: movaps %xmm0, %xmm3 ## encoding: [0x0f,0x28,0xd8] +; FMACALL64-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload +; FMACALL64-NEXT: ## encoding: [0x66,0x0f,0x14,0x5c,0x24,0x20] +; FMACALL64-NEXT: ## xmm3 = xmm3[0],mem[0] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x30] +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x44,0x24,0x50] ; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x4c,0x24,0x10] -; FMACALL64-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload -; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x14,0x24] +; FMACALL64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload +; FMACALL64-NEXT: ## encoding: [0x0f,0x28,0x54,0x24,0x30] ; FMACALL64-NEXT: addq $152, %rsp ## encoding: [0x48,0x81,0xc4,0x98,0x00,0x00,0x00] ; FMACALL64-NEXT: retq ## encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll @@ -530,10 +530,10 @@ ; 
NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm2 = mem[0,1],xmm2[2,3] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] ; NOFMA-NEXT: callq fmaf@PLT @@ -589,10 +589,10 @@ ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: callq fma@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm2 = mem[0,1],xmm2[2,3] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] ; NOFMA-NEXT: callq fma@PLT @@ -634,10 +634,10 @@ ; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: callq fmaf@PLT @@ -693,10 +693,10 @@ ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: callq fma@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; NOFMA-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] ; NOFMA-NEXT: callq fma@PLT @@ -732,30 +732,30 @@ ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; NOFMA-NEXT: pxor %xmm3, %xmm0 -; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: pxor %xmm3, %xmm2 ; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; NOFMA-NEXT: callq fmaf@PLT -; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] +; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm0 = mem[3,3,3,3] +; NOFMA-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm2 = mem[3,3,3,3] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; NOFMA-NEXT: callq fmaf@PLT -; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; NOFMA-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[1,1,1,1] ; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[1,1,1,1] @@ -764,7 +764,7 @@ ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload +; NOFMA-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0] ; NOFMA-NEXT: movdqa %xmm1, %xmm0 ; NOFMA-NEXT: addq $88, %rsp @@ -795,23 +795,22 @@ ; NOFMA-NEXT: subq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 80 ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] -; NOFMA-NEXT: xorps %xmm3, %xmm0 -; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: xorps %xmm3, %xmm2 -; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; NOFMA-NEXT: pxor %xmm3, %xmm0 +; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: pxor %xmm3, %xmm2 +; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; NOFMA-NEXT: callq fma@PLT -; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: pshufd 
$238, (%rsp), %xmm0 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] +; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fma@PLT -; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; NOFMA-NEXT: movdqa %xmm1, %xmm0 +; NOFMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0] ; NOFMA-NEXT: addq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq @@ -841,28 +840,28 @@ ; NOFMA-NEXT: .cfi_def_cfa_offset 96 ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] +; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: callq fmaf@PLT -; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] -; NOFMA-NEXT: callq fmaf@PLT -; NOFMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] @@ -871,7 +870,7 @@ ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; NOFMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; NOFMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0] ; 
NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; NOFMA-NEXT: movaps %xmm1, %xmm0 @@ -912,20 +911,19 @@ ; NOFMA-NEXT: .cfi_def_cfa_offset 80 ; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: callq fma@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; NOFMA-NEXT: callq fma@PLT +; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; NOFMA-NEXT: movaps %xmm1, %xmm0 +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; NOFMA-NEXT: callq fma@PLT +; NOFMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; NOFMA-NEXT: addq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq @@ -957,30 +955,30 @@ ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; NOFMA-NEXT: pxor %xmm3, %xmm0 -; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; NOFMA-NEXT: pxor %xmm3, %xmm2 ; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3] -; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; NOFMA-NEXT: callq fmaf@PLT -; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] +; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm0 = mem[3,3,3,3] +; NOFMA-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm2 = mem[3,3,3,3] ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; NOFMA-NEXT: callq fmaf@PLT -; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; NOFMA-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; NOFMA-NEXT: 
movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm0 = mem[1,1,1,1] ; NOFMA-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm2 = mem[1,1,1,1] @@ -989,7 +987,7 @@ ; NOFMA-NEXT: callq fmaf@PLT ; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload +; NOFMA-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0] ; NOFMA-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; NOFMA-NEXT: movdqa %xmm1, %xmm0 @@ -1031,24 +1029,23 @@ ; NOFMA-NEXT: subq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 80 ; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] -; NOFMA-NEXT: xorps %xmm3, %xmm0 -; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; NOFMA-NEXT: xorps %xmm3, %xmm2 -; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; NOFMA-NEXT: pxor %xmm3, %xmm0 +; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: pxor %xmm3, %xmm2 +; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; NOFMA-NEXT: callq fma@PLT -; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm0 = mem[2,3,2,3] -; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; NOFMA-NEXT: # xmm2 = mem[2,3,2,3] +; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; NOFMA-NEXT: callq fma@PLT -; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; NOFMA-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; NOFMA-NEXT: movdqa %xmm1, %xmm0 +; NOFMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NOFMA-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; NOFMA-NEXT: addq $72, %rsp ; NOFMA-NEXT: .cfi_def_cfa_offset 8 ; NOFMA-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -149,25 +149,25 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: subq $56, %rsp ; SSE2-NEXT: .cfi_def_cfa_offset 64 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundf 
-; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: addq $56, %rsp @@ -206,15 +206,14 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: subq $40, %rsp ; SSE2-NEXT: .cfi_def_cfa_offset 48 -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: callq _round ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _round +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] ; SSE2-NEXT: addq $40, %rsp ; SSE2-NEXT: retq ; @@ -252,37 +251,37 @@ ; SSE2-NEXT: subq $72, %rsp ; SSE2-NEXT: .cfi_def_cfa_offset 80 ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte 
Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill @@ -337,23 +336,24 @@ ; SSE2-NEXT: subq $56, %rsp ; SSE2-NEXT: .cfi_def_cfa_offset 64 ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: callq _round ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _round +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _round ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: addq $56, %rsp ; SSE2-NEXT: retq @@ -399,39 +399,18 @@ ; SSE2-NEXT: .cfi_def_cfa_offset 112 ; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload @@ -442,50 +421,71 @@ ; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; 
SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload +; SSE2-NEXT: movaps (%rsp), %xmm3 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0] ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload ; SSE2-NEXT: addq $104, %rsp ; SSE2-NEXT: retq ; @@ -546,45 +546,46 @@ ; SSE2-NEXT: .cfi_def_cfa_offset 96 ; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; 
SSE2-NEXT: callq _round -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: callq _round ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _round +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _round ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _round +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _round +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _round +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: unpcklpd (%rsp), %xmm3 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0] ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload ; SSE2-NEXT: addq $88, %rsp ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll --- a/llvm/test/CodeGen/X86/fp-roundeven.ll +++ b/llvm/test/CodeGen/X86/fp-roundeven.ll @@ -98,25 +98,25 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: subq $56, %rsp ; SSE2-NEXT: 
.cfi_def_cfa_offset 64 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: addq $56, %rsp @@ -140,15 +140,14 @@ ; SSE2: ## %bb.0: ; SSE2-NEXT: subq $40, %rsp ; SSE2-NEXT: .cfi_def_cfa_offset 48 -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: callq _roundeven ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] ; SSE2-NEXT: addq $40, %rsp ; SSE2-NEXT: retq ; @@ -171,37 +170,37 @@ ; SSE2-NEXT: subq $72, %rsp ; SSE2-NEXT: .cfi_def_cfa_offset 80 ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: 
callq _roundevenf +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill @@ -236,23 +235,24 @@ ; SSE2-NEXT: subq $56, %rsp ; SSE2-NEXT: .cfi_def_cfa_offset 64 ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: callq _roundeven ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _roundeven ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; 
SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: addq $56, %rsp ; SSE2-NEXT: retq @@ -278,39 +278,18 @@ ; SSE2-NEXT: .cfi_def_cfa_offset 112 ; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload @@ -321,50 +300,71 @@ ; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte 
Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundevenf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq _roundevenf -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload +; SSE2-NEXT: movaps (%rsp), %xmm3 ## 16-byte Reload ; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload ; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0] ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload ; SSE2-NEXT: addq $104, %rsp ; SSE2-NEXT: retq ; @@ -397,45 +397,46 @@ ; SSE2-NEXT: .cfi_def_cfa_offset 96 ; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: callq _roundeven ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _roundeven ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: unpcklpd (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0] ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = mem[0,1],xmm0[2,3] +; SSE2-NEXT: callq _roundeven +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq _roundeven -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: unpcklpd (%rsp), %xmm3 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0] 
 ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
-; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
 ; SSE2-NEXT: addq $88, %rsp
 ; SSE2-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/fp128-extract.ll b/llvm/test/CodeGen/X86/fp128-extract.ll
--- a/llvm/test/CodeGen/X86/fp128-extract.ll
+++ b/llvm/test/CodeGen/X86/fp128-extract.ll
@@ -7,14 +7,13 @@
 ; CHECK-LABEL: TestExtract:
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: callq __extenddftf2@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: callq __extenddftf2@PLT
-; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: callq __extenddftf2@PLT
+; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: addq $40, %rsp
 ; CHECK-NEXT: jmp __multf3@PLT # TAILCALL
 entry:
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -6,27 +6,27 @@
 define <2 x i32> @stest_f64i32(<2 x double> %x) {
 ; CHECK-LABEL: stest_f64i32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvttsd2si %xmm0, %rax
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; CHECK-NEXT: cvttsd2si %xmm1, %rax
 ; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: cvttsd2si %xmm0, %rax
-; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: movq %rax, %xmm2
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
 ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pxor %xmm0, %xmm2
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-NEXT: movdqa %xmm2, %xmm1
+; CHECK-NEXT: pxor %xmm0, %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; CHECK-NEXT: pxor %xmm4, %xmm4
 ; CHECK-NEXT: pcmpeqd %xmm3, %xmm4
 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; CHECK-NEXT: pcmpgtd %xmm2, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; CHECK-NEXT: pand %xmm4, %xmm2
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
+; CHECK-NEXT: pand %xmm4, %xmm1
 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-NEXT: por %xmm2, %xmm3
-; CHECK-NEXT: pand %xmm3, %xmm1
-; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-NEXT: por %xmm1, %xmm3
+; CHECK-NEXT: pand %xmm3, %xmm2
+; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-NEXT: por %xmm2, %xmm3
 ; CHECK-NEXT: pxor %xmm3, %xmm0
 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
@@ -54,17 +54,16 @@
 define <2 x i32> @utest_f64i32(<2 x double> %x) {
 ; CHECK-LABEL: utest_f64i32:
 ; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; CHECK-NEXT: cvttsd2si %xmm1, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: sarq $63, %rcx
 ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: movapd %xmm0, %xmm1
 ; CHECK-NEXT: subsd %xmm2, %xmm1
-; CHECK-NEXT: cvttsd2si %xmm1, %rax
-; CHECK-NEXT: cvttsd2si %xmm0, 
%rcx -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rax, %rdx -; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: cvttsd2si %xmm1, %rdx +; CHECK-NEXT: andq %rcx, %rdx +; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: cvttsd2si %xmm0, %rax ; CHECK-NEXT: subsd %xmm2, %xmm0 ; CHECK-NEXT: cvttsd2si %xmm0, %rcx @@ -73,20 +72,20 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] +; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: pand %xmm2, %xmm1 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: por %xmm1, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; CHECK-NEXT: retq entry: @@ -100,15 +99,15 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; CHECK-NEXT: cvttsd2si %xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: cvttsd2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-NEXT: pxor %xmm4, %xmm4 ; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 @@ -118,15 +117,15 @@ ; CHECK-NEXT: pand %xmm4, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-NEXT: por %xmm1, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm3 +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; CHECK-NEXT: pand %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; CHECK-NEXT: por %xmm0, 
%xmm1 @@ -146,14 +145,13 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-LABEL: stest_f32i32: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; CHECK-NEXT: cvttss2si %xmm1, %rax +; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm4 @@ -252,17 +250,17 @@ ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; CHECK-NEXT: cvttss2si %xmm3, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: subss %xmm2, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx +; CHECK-NEXT: cvttss2si %xmm3, %rdx ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss %xmm2, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -271,13 +269,13 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: movdqa %xmm0, %xmm4 -; CHECK-NEXT: pxor %xmm3, %xmm4 +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: pxor %xmm0, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm5 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm5 ; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259455,9223372039002259455] ; CHECK-NEXT: movdqa %xmm6, %xmm7 ; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 @@ -285,14 +283,14 @@ ; CHECK-NEXT: pand %xmm5, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm0 +; CHECK-NEXT: pand %xmm5, %xmm3 ; CHECK-NEXT: pandn %xmm2, %xmm5 -; CHECK-NEXT: por %xmm0, %xmm5 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm3, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm6 +; CHECK-NEXT: por %xmm3, %xmm5 +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm0, %xmm4 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] ; CHECK-NEXT: pand %xmm4, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] @@ -313,15 +311,14 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; CHECK-NEXT: cvttss2si 
%xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 ; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm0[3,3] ; CHECK-NEXT: cvttss2si %xmm2, %rax ; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm4 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] @@ -330,22 +327,22 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm4, %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] +; CHECK-NEXT: movdqa %xmm4, %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] ; CHECK-NEXT: pxor %xmm9, %xmm9 ; CHECK-NEXT: pcmpeqd %xmm9, %xmm5 ; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] ; CHECK-NEXT: movdqa %xmm3, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm7 ; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] ; CHECK-NEXT: pand %xmm5, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm4 -; CHECK-NEXT: pandn %xmm8, %xmm1 -; CHECK-NEXT: por %xmm4, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; CHECK-NEXT: por %xmm6, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm4 +; CHECK-NEXT: pandn %xmm8, %xmm2 +; CHECK-NEXT: por %xmm4, %xmm2 +; CHECK-NEXT: movdqa %xmm1, %xmm4 ; CHECK-NEXT: pxor %xmm0, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] ; CHECK-NEXT: pcmpeqd %xmm9, %xmm5 @@ -354,29 +351,29 @@ ; CHECK-NEXT: pand %xmm5, %xmm4 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm4, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pandn %xmm8, %xmm3 -; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm4 +; CHECK-NEXT: por %xmm1, %xmm3 +; CHECK-NEXT: movdqa %xmm3, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm4 ; CHECK-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm4, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: por %xmm1, %xmm4 ; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm2, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm3 ; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: pand %xmm3, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm0 -; CHECK-NEXT: pand %xmm1, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] ; CHECK-NEXT: retq entry: @@ -874,22 +871,22 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT @@ -951,22 +948,22 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq 
$48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT @@ -1075,45 +1072,45 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pandn %xmm1, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: pandn %xmm1, %xmm3 -; CHECK-NEXT: por %xmm0, %xmm3 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535] +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: pandn %xmm0, %xmm1 +; CHECK-NEXT: por %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 ; CHECK-NEXT: pand %xmm2, %xmm3 -; CHECK-NEXT: pslld $16, %xmm3 -; CHECK-NEXT: psrad $16, %xmm3 +; CHECK-NEXT: pandn %xmm0, %xmm2 +; CHECK-NEXT: por %xmm3, %xmm2 +; CHECK-NEXT: pxor %xmm3, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: pslld $16, %xmm2 +; CHECK-NEXT: psrad $16, %xmm2 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm3, %xmm0 +; CHECK-NEXT: packssdw %xmm2, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -1143,33 +1140,33 @@ ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __fixdfti@PLT -; CHECK-NEXT: movq 
%rax, %r14 -; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __fixdfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movabsq $9223372036854775807, %rsi # imm = 0x7FFFFFFFFFFFFFFF +; CHECK-NEXT: cmpq %rsi, %rbx +; CHECK-NEXT: movq %r14, %rdi +; CHECK-NEXT: sbbq $0, %rdi +; CHECK-NEXT: cmovgeq %rcx, %r14 +; CHECK-NEXT: cmovgeq %rsi, %rbx ; CHECK-NEXT: cmpq %rsi, %rax ; CHECK-NEXT: movq %rdx, %rdi ; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovgeq %rcx, %rdx -; CHECK-NEXT: cmovgeq %rsi, %rax -; CHECK-NEXT: cmpq %rsi, %r14 -; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: sbbq $0, %rdi -; CHECK-NEXT: cmovlq %rbx, %rcx -; CHECK-NEXT: cmovlq %r14, %rsi -; CHECK-NEXT: movabsq $-9223372036854775808, %r8 # imm = 0x8000000000000000 -; CHECK-NEXT: cmpq %rsi, %r8 -; CHECK-NEXT: movq $-1, %rbx +; CHECK-NEXT: cmovlq %rdx, %rcx +; CHECK-NEXT: cmovlq %rax, %rsi +; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; CHECK-NEXT: cmpq %rbx, %rax +; CHECK-NEXT: movq $-1, %rdx ; CHECK-NEXT: movq $-1, %rdi -; CHECK-NEXT: sbbq %rcx, %rdi -; CHECK-NEXT: cmovgeq %r8, %rsi -; CHECK-NEXT: cmpq %rax, %r8 -; CHECK-NEXT: sbbq %rdx, %rbx -; CHECK-NEXT: cmovgeq %r8, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movq %rsi, %xmm1 +; CHECK-NEXT: sbbq %r14, %rdi +; CHECK-NEXT: cmovgeq %rax, %rbx +; CHECK-NEXT: cmpq %rsi, %rax +; CHECK-NEXT: sbbq %rcx, %rdx +; CHECK-NEXT: cmovgeq %rax, %rsi +; CHECK-NEXT: movq %rbx, %xmm1 +; CHECK-NEXT: movq %rsi, %xmm0 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 @@ -1200,19 +1197,19 @@ ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __fixunsdfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __fixunsdfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: cmovneq %rcx, %rax ; CHECK-NEXT: testq %r14, %r14 ; CHECK-NEXT: cmovneq %rcx, %rbx -; CHECK-NEXT: movq %rbx, %xmm0 -; CHECK-NEXT: movq %rax, %xmm1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovneq %rcx, %rax +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rbx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 @@ -1605,27 +1602,27 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: stest_f64i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; CHECK-NEXT: cvttsd2si %xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: cvttsd2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movq %rax, %xmm2 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-NEXT: movdqa %xmm2, %xmm1 +; CHECK-NEXT: pxor %xmm0, %xmm1 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] ; CHECK-NEXT: pxor %xmm4, %xmm4 ; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 ; 
CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pcmpgtd %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm1 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; CHECK-NEXT: por %xmm1, %xmm3 +; CHECK-NEXT: pand %xmm3, %xmm2 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-NEXT: por %xmm2, %xmm3 ; CHECK-NEXT: pxor %xmm3, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-NEXT: pcmpeqd %xmm2, %xmm2 @@ -1651,17 +1648,16 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: utest_f64i32_mm: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; CHECK-NEXT: cvttsd2si %xmm1, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: subsd %xmm2, %xmm1 -; CHECK-NEXT: cvttsd2si %xmm1, %rax -; CHECK-NEXT: cvttsd2si %xmm0, %rcx -; CHECK-NEXT: movq %rcx, %rdx -; CHECK-NEXT: sarq $63, %rdx -; CHECK-NEXT: andq %rax, %rdx -; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: cvttsd2si %xmm1, %rdx +; CHECK-NEXT: andq %rcx, %rdx +; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: cvttsd2si %xmm0, %rax ; CHECK-NEXT: subsd %xmm2, %xmm0 ; CHECK-NEXT: cvttsd2si %xmm0, %rcx @@ -1670,20 +1666,20 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456] +; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259455,9223372039002259455] -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: pand %xmm3, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: pand %xmm2, %xmm1 -; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: por %xmm1, %xmm2 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; CHECK-NEXT: retq entry: @@ -1696,15 +1692,15 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si %xmm0, %rax +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; CHECK-NEXT: cvttsd2si %xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: cvttsd2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm1, 
%xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm1, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] ; CHECK-NEXT: pxor %xmm4, %xmm4 ; CHECK-NEXT: pcmpeqd %xmm3, %xmm4 @@ -1714,15 +1710,15 @@ ; CHECK-NEXT: pand %xmm4, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; CHECK-NEXT: por %xmm2, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm1 +; CHECK-NEXT: pand %xmm3, %xmm0 ; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 -; CHECK-NEXT: por %xmm1, %xmm3 -; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; CHECK-NEXT: por %xmm0, %xmm3 +; CHECK-NEXT: movdqa %xmm3, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm1, %xmm2 +; CHECK-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; CHECK-NEXT: pand %xmm2, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; CHECK-NEXT: por %xmm0, %xmm1 @@ -1740,14 +1736,13 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: stest_f32i32_mm: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; CHECK-NEXT: cvttss2si %xmm1, %rax +; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm3 @@ -1844,17 +1839,17 @@ ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; CHECK-NEXT: cvttss2si %xmm3, %rax +; CHECK-NEXT: movq %rax, %rcx +; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: subss %xmm2, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %rcx -; CHECK-NEXT: movq %rax, %rdx -; CHECK-NEXT: sarq $63, %rdx +; CHECK-NEXT: cvttss2si %xmm3, %rdx ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm3 -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: subss %xmm2, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %rcx @@ -1863,34 +1858,34 @@ ; CHECK-NEXT: andq %rcx, %rdx ; CHECK-NEXT: orq %rax, %rdx ; CHECK-NEXT: movq %rdx, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; CHECK-NEXT: movdqa %xmm0, %xmm3 -; CHECK-NEXT: pxor %xmm2, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm4 +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; CHECK-NEXT: movdqa %xmm3, %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm0, %xmm4 ; CHECK-NEXT: movdqa 
{{.*#+}} xmm5 = [9223372039002259455,9223372039002259455] ; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm3 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm6 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; CHECK-NEXT: pand %xmm4, %xmm2 ; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; CHECK-NEXT: por %xmm2, %xmm4 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; CHECK-NEXT: pand %xmm4, %xmm3 +; CHECK-NEXT: pandn %xmm2, %xmm4 ; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; CHECK-NEXT: pand %xmm4, %xmm0 -; CHECK-NEXT: pandn %xmm3, %xmm4 -; CHECK-NEXT: por %xmm0, %xmm4 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: pxor %xmm2, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm2, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm2 +; CHECK-NEXT: movdqa %xmm1, %xmm3 +; CHECK-NEXT: pxor %xmm0, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] +; CHECK-NEXT: pcmpeqd %xmm0, %xmm6 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-NEXT: pand %xmm6, %xmm3 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm2, %xmm0 +; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 +; CHECK-NEXT: pandn %xmm2, %xmm0 ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] ; CHECK-NEXT: retq @@ -1904,14 +1899,13 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32_mm: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; CHECK-NEXT: cvttss2si %xmm1, %rax +; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; CHECK-NEXT: cvttss2si %xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm2 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movq %rax, %xmm3 @@ -2448,22 +2442,22 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT @@ -2523,22 +2517,22 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %rax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT @@ -2646,45 +2640,45 @@ ; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT ; CHECK-NEXT: cvttss2si %xmm0, %eax ; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] -; CHECK-NEXT: movdqa %xmm1, %xmm2 -; CHECK-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-NEXT: pand %xmm2, %xmm0 -; CHECK-NEXT: pandn %xmm1, %xmm2 -; CHECK-NEXT: por %xmm0, %xmm2 -; CHECK-NEXT: movdqa %xmm1, %xmm3 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pcmpgtd %xmm0, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: pandn %xmm1, %xmm3 -; CHECK-NEXT: por %xmm0, %xmm3 -; CHECK-NEXT: pxor %xmm1, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm0 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm0 -; CHECK-NEXT: pand %xmm3, %xmm0 -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm3 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; CHECK-NEXT: # xmm2 = xmm2[0],mem[0] +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535] +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: pcmpgtd %xmm2, %xmm1 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: pandn %xmm0, %xmm1 +; CHECK-NEXT: por %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm0, %xmm2 +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 ; CHECK-NEXT: pand %xmm2, %xmm3 -; CHECK-NEXT: pslld $16, %xmm3 -; CHECK-NEXT: psrad $16, %xmm3 +; CHECK-NEXT: pandn %xmm0, %xmm2 +; CHECK-NEXT: por %xmm3, %xmm2 +; CHECK-NEXT: pxor %xmm3, %xmm3 +; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: pcmpgtd %xmm3, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: pslld $16, %xmm2 +; CHECK-NEXT: psrad $16, %xmm2 ; CHECK-NEXT: pslld $16, %xmm0 ; CHECK-NEXT: psrad $16, %xmm0 -; CHECK-NEXT: packssdw %xmm3, %xmm0 +; CHECK-NEXT: packssdw %xmm2, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -2732,14 +2726,14 @@ ; CHECK-NEXT: cmovsq %rbx, %rcx ; CHECK-NEXT: cmoveq %rsi, %rcx ; CHECK-NEXT: cmovsq %r14, %rdi -; CHECK-NEXT: testq %rdi, %rdi ; CHECK-NEXT: movabsq $-9223372036854775808, %rbx # imm = 0x8000000000000000 -; CHECK-NEXT: movq %rbx, %rsi -; CHECK-NEXT: cmovnsq %rcx, %rsi ; CHECK-NEXT: cmpq %rbx, %rcx -; CHECK-NEXT: cmovbeq %rbx, %rcx +; CHECK-NEXT: movq %rbx, %rsi +; CHECK-NEXT: cmovaq %rcx, %rsi +; CHECK-NEXT: testq %rdi, %rdi +; CHECK-NEXT: cmovsq %rbx, %rcx ; CHECK-NEXT: cmpq $-1, %rdi -; CHECK-NEXT: cmovneq %rsi, %rcx +; CHECK-NEXT: cmoveq %rsi, %rcx ; CHECK-NEXT: testq %rdx, %rdx ; CHECK-NEXT: movq %rbx, %rsi ; CHECK-NEXT: cmovnsq %rax, %rsi @@ -2777,23 +2771,23 @@ ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __fixunsdfti@PLT ; CHECK-NEXT: movq %rax, %rbx ; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __fixunsdfti@PLT ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: testq %rdx, %rdx -; CHECK-NEXT: cmovneq %rcx, %rax -; CHECK-NEXT: cmpq $1, %rdx -; CHECK-NEXT: cmoveq %rcx, %rax ; CHECK-NEXT: testq %r14, %r14 ; CHECK-NEXT: 
cmovneq %rcx, %rbx ; CHECK-NEXT: cmpq $1, %r14 ; CHECK-NEXT: cmoveq %rcx, %rbx -; CHECK-NEXT: movq %rbx, %xmm0 -; CHECK-NEXT: movq %rax, %xmm1 +; CHECK-NEXT: testq %rdx, %rdx +; CHECK-NEXT: cmovneq %rcx, %rax +; CHECK-NEXT: cmpq $1, %rdx +; CHECK-NEXT: cmoveq %rcx, %rax +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq %rbx, %xmm1 ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 24 diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll --- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -15,29 +15,28 @@ define <4 x i1> @test_signed_v4i1_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_signed_v4i1_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ucomiss %xmm1, %xmm1 ; CHECK-NEXT: maxss %xmm2, %xmm1 -; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: minss %xmm3, %xmm1 +; CHECK-NEXT: xorps %xmm4, %xmm4 +; CHECK-NEXT: minss %xmm4, %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %ecx ; CHECK-NEXT: cmovpl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm4 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; CHECK-NEXT: ucomiss %xmm4, %xmm4 -; CHECK-NEXT: maxss %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %ecx +; CHECK-NEXT: movd %ecx, %xmm3 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; CHECK-NEXT: ucomiss %xmm1, %xmm1 +; CHECK-NEXT: maxss %xmm2, %xmm1 +; CHECK-NEXT: minss %xmm4, %xmm1 +; CHECK-NEXT: cvttss2si %xmm1, %ecx ; CHECK-NEXT: cmovpl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-NEXT: movd %ecx, %xmm1 +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: maxss %xmm2, %xmm1 -; CHECK-NEXT: minss %xmm3, %xmm1 +; CHECK-NEXT: minss %xmm4, %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %ecx ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: cmovpl %eax, %ecx @@ -45,12 +44,12 @@ ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: ucomiss %xmm0, %xmm0 ; CHECK-NEXT: maxss %xmm2, %xmm0 -; CHECK-NEXT: minss %xmm3, %xmm0 +; CHECK-NEXT: minss %xmm4, %xmm0 ; CHECK-NEXT: cvttss2si %xmm0, %ecx ; CHECK-NEXT: cmovpl %eax, %ecx ; CHECK-NEXT: movd %ecx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq %x = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> %f) @@ -70,8 +69,8 @@ ; CHECK-NEXT: minss %xmm3, %xmm4 ; CHECK-NEXT: cvttss2si %xmm4, %eax ; CHECK-NEXT: shll $8, %eax -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; CHECK-NEXT: movaps %xmm2, %xmm4 ; CHECK-NEXT: maxss %xmm3, %xmm4 ; CHECK-NEXT: movaps %xmm1, %xmm3 @@ -117,8 +116,8 @@ ; CHECK-NEXT: cvttss2si %xmm3, %ecx ; CHECK-NEXT: movd %ecx, %xmm1 ; CHECK-NEXT: pinsrw $1, %eax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = 
xmm3[1],xmm0[1]
+; CHECK-NEXT: xorps %xmm3, %xmm3
+; CHECK-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
; CHECK-NEXT: movaps %xmm2, %xmm5
; CHECK-NEXT: maxss %xmm3, %xmm5
; CHECK-NEXT: movaps %xmm4, %xmm3
@@ -139,41 +138,40 @@
define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind {
; CHECK-LABEL: test_signed_v4i32_v4f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: movaps %xmm0, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; CHECK-NEXT: cvttss2si %xmm1, %edx
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: ucomiss %xmm2, %xmm1
+; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: ucomiss %xmm3, %xmm1
; CHECK-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
; CHECK-NEXT: cmoval %eax, %edx
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: ucomiss %xmm1, %xmm1
; CHECK-NEXT: cmovpl %ecx, %edx
-; CHECK-NEXT: movd %edx, %xmm1
-; CHECK-NEXT: movaps %xmm0, %xmm3
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
-; CHECK-NEXT: cvttss2si %xmm3, %edx
-; CHECK-NEXT: ucomiss %xmm2, %xmm3
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
+; CHECK-NEXT: cvttss2si %xmm1, %edx
+; CHECK-NEXT: ucomiss %xmm3, %xmm1
; CHECK-NEXT: cmoval %eax, %edx
-; CHECK-NEXT: ucomiss %xmm3, %xmm3
+; CHECK-NEXT: ucomiss %xmm1, %xmm1
; CHECK-NEXT: cmovpl %ecx, %edx
-; CHECK-NEXT: movd %edx, %xmm3
-; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; CHECK-NEXT: movd %edx, %xmm1
+; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; CHECK-NEXT: cvttss2si %xmm0, %edx
-; CHECK-NEXT: ucomiss %xmm2, %xmm0
+; CHECK-NEXT: ucomiss %xmm3, %xmm0
; CHECK-NEXT: cmoval %eax, %edx
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ecx, %edx
; CHECK-NEXT: movd %edx, %xmm1
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: cvttss2si %xmm0, %edx
-; CHECK-NEXT: ucomiss %xmm2, %xmm0
+; CHECK-NEXT: ucomiss %xmm3, %xmm0
; CHECK-NEXT: cmoval %eax, %edx
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %ecx, %edx
; CHECK-NEXT: movd %edx, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%x = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %f)
@@ -184,39 +182,39 @@
; CHECK-LABEL: test_signed_v4i64_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: cvttss2si %xmm0, %rdx
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: ucomiss %xmm3, %xmm0
; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: cmovaq %rax, %rdx
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %rcx, %rdx
; CHECK-NEXT: movq %rdx, %xmm2
-; CHECK-NEXT: movaps %xmm0, %xmm3
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
-; CHECK-NEXT: cvttss2si %xmm3, %rdx
-; CHECK-NEXT: ucomiss %xmm1, %xmm3
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-NEXT: cvttss2si %xmm1, %rdx
+; CHECK-NEXT: ucomiss %xmm3, %xmm1
; CHECK-NEXT: cmovaq %rax, %rdx
-; CHECK-NEXT: ucomiss %xmm3, %xmm3
+; CHECK-NEXT: ucomiss %xmm1, %xmm1
; CHECK-NEXT: cmovpq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %xmm3
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; CHECK-NEXT: movaps %xmm0, %xmm3
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3]
-; CHECK-NEXT: cvttss2si %xmm3, %rdx
-; CHECK-NEXT: ucomiss %xmm1, %xmm3
+; CHECK-NEXT: movq %rdx, %xmm1
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; CHECK-NEXT: cvttss2si %xmm1, %rdx
+; CHECK-NEXT: ucomiss %xmm3, %xmm1
; CHECK-NEXT: cmovaq %rax, %rdx
-; CHECK-NEXT: ucomiss %xmm3, %xmm3
+; CHECK-NEXT: ucomiss %xmm1, %xmm1
; CHECK-NEXT: cmovpq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %xmm3
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movq %rdx, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: cvttss2si %xmm0, %rdx
-; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: ucomiss %xmm3, %xmm0
; CHECK-NEXT: cmovaq %rax, %rdx
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %xmm1
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> %f)
@@ -254,8 +252,8 @@
; CHECK-NEXT: cmovpq %rbp, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: cmovpq %rbp, %r15
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: callq __fixsfti@PLT
; CHECK-NEXT: movq %rax, %r13
@@ -345,25 +343,23 @@
define <2 x i1> @test_signed_v2i1_v2f64(<2 x double> %f) nounwind {
; CHECK-LABEL: test_signed_v2i1_v2f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: movapd %xmm0, %xmm1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: ucomisd %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm2, %xmm1
; CHECK-NEXT: xorpd %xmm3, %xmm3
; CHECK-NEXT: minsd %xmm3, %xmm1
-; CHECK-NEXT: cvttsd2si %xmm1, %rax
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: ucomisd %xmm0, %xmm0
-; CHECK-NEXT: cmovpq %rcx, %rax
-; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: cvttsd2si %xmm1, %rcx
+; CHECK-NEXT: cmovpq %rax, %rcx
+; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: ucomisd %xmm0, %xmm0
; CHECK-NEXT: maxsd %xmm2, %xmm0
; CHECK-NEXT: minsd %xmm3, %xmm0
-; CHECK-NEXT: cvttsd2si %xmm0, %rax
-; CHECK-NEXT: cmovpq %rcx, %rax
-; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: cvttsd2si %xmm0, %rcx
+; CHECK-NEXT: cmovpq %rax, %rcx
+; CHECK-NEXT: movq %rcx, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
%x = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> %f)
ret <2 x i1> %x
@@ -395,17 +391,17 @@
define <2 x i16> @test_signed_v2i16_v2f64(<2 x double> %f) nounwind {
; CHECK-LABEL: test_signed_v2i16_v2f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movapd %xmm2, %xmm3
+; CHECK-NEXT: maxsd %xmm1, %xmm3
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: movapd %xmm1, %xmm2
-; CHECK-NEXT: maxsd %xmm0, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movapd %xmm1, %xmm4
+; CHECK-NEXT: minsd %xmm3, %xmm4
+; CHECK-NEXT: cvttsd2si %xmm4, %eax
; CHECK-NEXT: maxsd %xmm0, %xmm2
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movapd %xmm0, %xmm3
-; CHECK-NEXT: minsd %xmm2, %xmm3
-; CHECK-NEXT: cvttsd2si %xmm3, %eax
-; CHECK-NEXT: minsd %xmm1, %xmm0
-; CHECK-NEXT: cvttsd2si %xmm0, %ecx
+; CHECK-NEXT: minsd %xmm2, %xmm1
+; CHECK-NEXT: cvttsd2si %xmm1, %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pinsrw $1, %eax, %xmm0
; CHECK-NEXT: retq
@@ -416,25 +412,23 @@
define <2 x i32> @test_signed_v2i32_v2f64(<2 x double> %f) nounwind {
; CHECK-LABEL: test_signed_v2i32_v2f64:
; CHECK: # %bb.0:
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: movapd %xmm0, %xmm1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: ucomisd %xmm1, %xmm1
; CHECK-NEXT: maxsd %xmm2, %xmm1
; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: minsd %xmm3, %xmm1
-; CHECK-NEXT: cvttsd2si %xmm1, %eax
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: ucomisd %xmm0, %xmm0
-; CHECK-NEXT: cmovpl %ecx, %eax
-; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: cvttsd2si %xmm1, %ecx
+; CHECK-NEXT: cmovpl %eax, %ecx
+; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: ucomisd %xmm0, %xmm0
; CHECK-NEXT: maxsd %xmm2, %xmm0
; CHECK-NEXT: minsd %xmm3, %xmm0
-; CHECK-NEXT: cvttsd2si %xmm0, %eax
-; CHECK-NEXT: cmovpl %ecx, %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: cvttsd2si %xmm0, %ecx
+; CHECK-NEXT: cmovpl %eax, %ecx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: retq
%x = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %f)
ret <2 x i32> %x
@@ -443,24 +437,23 @@
define <2 x i64> @test_signed_v2i64_v2f64(<2 x double> %f) nounwind {
; CHECK-LABEL: test_signed_v2i64_v2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: cvttsd2si %xmm0, %rax
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; CHECK-NEXT: cvttsd2si %xmm1, %rax
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; CHECK-NEXT: ucomisd %xmm2, %xmm0
+; CHECK-NEXT: ucomisd %xmm2, %xmm1
; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; CHECK-NEXT: cmovaq %rcx, %rax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: ucomisd %xmm0, %xmm0
+; CHECK-NEXT: ucomisd %xmm1, %xmm1
; CHECK-NEXT: cmovpq %rdx, %rax
; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: cvttsd2si %xmm0, %rax
; CHECK-NEXT: ucomisd %xmm2, %xmm0
; CHECK-NEXT: cmovaq %rcx, %rax
; CHECK-NEXT: ucomisd %xmm0, %xmm0
; CHECK-NEXT: cmovpq %rdx, %rax
; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
%x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> %f)
ret <2 x i64> %x
@@ -585,8 +578,8 @@
; CHECK-NEXT: cmovpl %ebx, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -734,8 +727,8 @@
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: cmovpl %r15d, %ebx
; CHECK-NEXT: shll $8, %ebx
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -833,8 +826,8 @@
; CHECK-NEXT: cmovpl %ebx, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1013,8 +1006,8 @@
; CHECK-NEXT: cmovpl %ebx, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1111,8 +1104,8 @@
; CHECK-NEXT: cmovpq %r15, %rax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1236,8 +1229,8 @@
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: cmovpq %r12, %rdx
; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: callq __fixsfti@PLT
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
--- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll
@@ -23,8 +23,7 @@
; CHECK-NEXT: minss %xmm3, %xmm1
; CHECK-NEXT: cvttss2si %xmm1, %eax
; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: movaps %xmm0, %xmm4
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; CHECK-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
; CHECK-NEXT: maxss %xmm2, %xmm4
; CHECK-NEXT: minss %xmm3, %xmm4
; CHECK-NEXT: cvttss2si %xmm4, %eax
@@ -61,8 +60,8 @@
; CHECK-NEXT: minss %xmm3, %xmm4
; CHECK-NEXT: cvttss2si %xmm4, %eax
; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: movaps %xmm0, %xmm3
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; CHECK-NEXT: xorps %xmm3, %xmm3
+; CHECK-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
; CHECK-NEXT: xorps %xmm4, %xmm4
; CHECK-NEXT: maxss %xmm3, %xmm4
; CHECK-NEXT: movaps %xmm1, %xmm3
@@ -108,8 +107,8 @@
; CHECK-NEXT: cvttss2si %xmm3, %ecx
; CHECK-NEXT: movd %ecx, %xmm1
; CHECK-NEXT: pinsrw $1, %eax, %xmm1
-; CHECK-NEXT: movaps %xmm0, %xmm3
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; CHECK-NEXT: xorps %xmm3, %xmm3
+; CHECK-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
; CHECK-NEXT: xorps %xmm5, %xmm5
; CHECK-NEXT: maxss %xmm3, %xmm5
; CHECK-NEXT: movaps %xmm4, %xmm3
@@ -142,8 +141,7 @@
; CHECK-NEXT: movl $-1, %ecx
; CHECK-NEXT: cmoval %ecx, %edx
; CHECK-NEXT: movd %edx, %xmm1
-; CHECK-NEXT: movaps %xmm0, %xmm4
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; CHECK-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
; CHECK-NEXT: cvttss2si %xmm4, %rdx
; CHECK-NEXT: ucomiss %xmm2, %xmm4
; CHECK-NEXT: cmovbl %eax, %edx
@@ -185,61 +183,60 @@
; CHECK-NEXT: andq %rax, %rdx
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: xorps %xmm3, %xmm3
-; CHECK-NEXT: ucomiss %xmm3, %xmm0
+; CHECK-NEXT: xorps %xmm2, %xmm2
+; CHECK-NEXT: ucomiss %xmm2, %xmm0
; CHECK-NEXT: cmovbq %rax, %rdx
-; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; CHECK-NEXT: ucomiss %xmm4, %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: ucomiss %xmm3, %xmm0
; CHECK-NEXT: movq $-1, %rcx
; CHECK-NEXT: cmovaq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %xmm2
-; CHECK-NEXT: movaps %xmm0, %xmm5
-; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[1,1]
-; CHECK-NEXT: movaps %xmm5, %xmm6
-; CHECK-NEXT: subss %xmm1, %xmm6
+; CHECK-NEXT: movaps %xmm0, %xmm4
+; CHECK-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
+; CHECK-NEXT: movaps %xmm0, %xmm6
+; CHECK-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,1],xmm0[1,1]
+; CHECK-NEXT: movaps %xmm6, %xmm0
+; CHECK-NEXT: subss %xmm1, %xmm0
+; CHECK-NEXT: cvttss2si %xmm0, %rsi
+; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: cvttss2si %xmm6, %rdx
-; CHECK-NEXT: cvttss2si %xmm5, %rsi
-; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: movq %rdx, %rdi
; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: andq %rdx, %rdi
-; CHECK-NEXT: orq %rsi, %rdi
-; CHECK-NEXT: ucomiss %xmm3, %xmm5
+; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: orq %rdx, %rdi
+; CHECK-NEXT: ucomiss %xmm2, %xmm6
; CHECK-NEXT: cmovbq %rax, %rdi
-; CHECK-NEXT: ucomiss %xmm4, %xmm5
+; CHECK-NEXT: ucomiss %xmm3, %xmm6
; CHECK-NEXT: cmovaq %rcx, %rdi
-; CHECK-NEXT: movq %rdi, %xmm5
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; CHECK-NEXT: movaps %xmm0, %xmm5
-; CHECK-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm0[3,3]
-; CHECK-NEXT: movaps %xmm5, %xmm6
+; CHECK-NEXT: movq %rdi, %xmm6
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; CHECK-NEXT: movaps %xmm4, %xmm6
; CHECK-NEXT: subss %xmm1, %xmm6
; CHECK-NEXT: cvttss2si %xmm6, %rdx
-; CHECK-NEXT: cvttss2si %xmm5, %rsi
+; CHECK-NEXT: cvttss2si %xmm4, %rsi
; CHECK-NEXT: movq %rsi, %rdi
; CHECK-NEXT: sarq $63, %rdi
; CHECK-NEXT: andq %rdx, %rdi
; CHECK-NEXT: orq %rsi, %rdi
-; CHECK-NEXT: ucomiss %xmm3, %xmm5
+; CHECK-NEXT: ucomiss %xmm2, %xmm4
; CHECK-NEXT: cmovbq %rax, %rdi
-; CHECK-NEXT: ucomiss %xmm4, %xmm5
+; CHECK-NEXT: ucomiss %xmm3, %xmm4
; CHECK-NEXT: cmovaq %rcx, %rdi
-; CHECK-NEXT: movq %rdi, %xmm5
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movaps %xmm0, %xmm6
+; CHECK-NEXT: movq %rdi, %xmm4
+; CHECK-NEXT: cvttss2si %xmm5, %rdx
+; CHECK-NEXT: movq %rdx, %rsi
+; CHECK-NEXT: sarq $63, %rsi
+; CHECK-NEXT: movaps %xmm5, %xmm6
; CHECK-NEXT: subss %xmm1, %xmm6
-; CHECK-NEXT: cvttss2si %xmm6, %rdx
-; CHECK-NEXT: cvttss2si %xmm0, %rsi
-; CHECK-NEXT: movq %rsi, %rdi
-; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: andq %rdx, %rdi
-; CHECK-NEXT: orq %rsi, %rdi
-; CHECK-NEXT: ucomiss %xmm3, %xmm0
+; CHECK-NEXT: cvttss2si %xmm6, %rdi
+; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: orq %rdx, %rdi
+; CHECK-NEXT: ucomiss %xmm2, %xmm5
; CHECK-NEXT: cmovbq %rax, %rdi
-; CHECK-NEXT: ucomiss %xmm4, %xmm0
+; CHECK-NEXT: ucomiss %xmm3, %xmm5
; CHECK-NEXT: cmovaq %rcx, %rdi
; CHECK-NEXT: movq %rdi, %xmm1
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; CHECK-NEXT: retq
%x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f)
ret <4 x i64> %x
@@ -272,8 +269,8 @@
; CHECK-NEXT: cmovaq %rbp, %rax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: cmovaq %rbp, %r15
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: callq __fixunssfti@PLT
; CHECK-NEXT: movq %rax, %r12
@@ -347,20 +344,18 @@
define <2 x i1> @test_unsigned_v2i1_v2f64(<2 x double> %f) nounwind {
; CHECK-LABEL: test_unsigned_v2i1_v2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: xorpd %xmm2, %xmm2
-; CHECK-NEXT: movapd %xmm0, %xmm1
-; CHECK-NEXT: maxsd %xmm2, %xmm1
+; CHECK-NEXT: xorpd %xmm1, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
-; CHECK-NEXT: minsd %xmm3, %xmm1
-; CHECK-NEXT: cvttsd2si %xmm1, %rax
-; CHECK-NEXT: movq %rax, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: maxsd %xmm2, %xmm0
; CHECK-NEXT: minsd %xmm3, %xmm0
; CHECK-NEXT: cvttsd2si %xmm0, %rax
; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: maxsd %xmm1, %xmm2
+; CHECK-NEXT: minsd %xmm3, %xmm2
+; CHECK-NEXT: cvttsd2si %xmm2, %rax
+; CHECK-NEXT: movq %rax, %xmm1
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
%x = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> %f)
ret <2 x i1> %x
@@ -392,17 +387,17 @@
define <2 x i16> @test_unsigned_v2i16_v2f64(<2 x double> %f) nounwind {
; CHECK-LABEL: test_unsigned_v2i16_v2f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: xorpd %xmm1, %xmm1
-; CHECK-NEXT: maxsd %xmm0, %xmm1
-; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; CHECK-NEXT: xorpd %xmm2, %xmm2
+; CHECK-NEXT: xorpd %xmm3, %xmm3
+; CHECK-NEXT: maxsd %xmm1, %xmm3
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movapd %xmm1, %xmm4
+; CHECK-NEXT: minsd %xmm3, %xmm4
+; CHECK-NEXT: cvttsd2si %xmm4, %eax
; CHECK-NEXT: maxsd %xmm0, %xmm2
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movapd %xmm0, %xmm3
-; CHECK-NEXT: minsd %xmm2, %xmm3
-; CHECK-NEXT: cvttsd2si %xmm3, %eax
-; CHECK-NEXT: minsd %xmm1, %xmm0
-; CHECK-NEXT: cvttsd2si %xmm0, %ecx
+; CHECK-NEXT: minsd %xmm2, %xmm1
+; CHECK-NEXT: cvttsd2si %xmm1, %ecx
; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: pinsrw $1, %eax, %xmm0
; CHECK-NEXT: retq
@@ -455,14 +450,14 @@
; CHECK-NEXT: cmovaq %rcx, %rdx
; CHECK-NEXT: movq %rdx, %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: cvttsd2si %xmm0, %rdx
+; CHECK-NEXT: movq %rdx, %rsi
+; CHECK-NEXT: sarq $63, %rsi
; CHECK-NEXT: movapd %xmm0, %xmm5
; CHECK-NEXT: subsd %xmm2, %xmm5
-; CHECK-NEXT: cvttsd2si %xmm5, %rdx
-; CHECK-NEXT: cvttsd2si %xmm0, %rsi
-; CHECK-NEXT: movq %rsi, %rdi
-; CHECK-NEXT: sarq $63, %rdi
-; CHECK-NEXT: andq %rdx, %rdi
-; CHECK-NEXT: orq %rsi, %rdi
+; CHECK-NEXT: cvttsd2si %xmm5, %rdi
+; CHECK-NEXT: andq %rsi, %rdi
+; CHECK-NEXT: orq %rdx, %rdi
; CHECK-NEXT: ucomisd %xmm3, %xmm0
; CHECK-NEXT: cmovbq %rax, %rdi
; CHECK-NEXT: ucomisd %xmm4, %xmm0
@@ -578,8 +573,8 @@
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -706,8 +701,8 @@
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmoval %r15d, %ebx
; CHECK-NEXT: shll $8, %ebx
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -791,8 +786,8 @@
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %eax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -945,8 +940,8 @@
; CHECK-NEXT: cmoval %ebp, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: cvttss2si %xmm0, %rax
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1064,17 +1059,17 @@
; CHECK-NEXT: cmovaq %rbx, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
+; CHECK-NEXT: cvttss2si %xmm0, %rax
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: sarq $63, %rcx
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-NEXT: cvttss2si %xmm1, %rax
-; CHECK-NEXT: cvttss2si %xmm0, %rcx
-; CHECK-NEXT: movq %rcx, %rdx
-; CHECK-NEXT: sarq $63, %rdx
-; CHECK-NEXT: andq %rax, %rdx
-; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: cvttss2si %xmm1, %rdx
+; CHECK-NEXT: andq %rcx, %rdx
+; CHECK-NEXT: orq %rax, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: cmovbq %r14, %rdx
; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1188,8 +1183,8 @@
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: cmovaq %r13, %rdx
; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT: callq __fixunssfti@PLT
diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll
--- a/llvm/test/CodeGen/X86/frem.ll
+++ b/llvm/test/CodeGen/X86/frem.ll
@@ -100,25 +100,25 @@
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: movaps %xmm4, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmodf@PLT
@@ -133,27 +133,27 @@
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -161,48 +161,48 @@
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
-; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmodf@PLT
@@ -217,7 +217,7 @@
; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, 48(%rbx)
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps %xmm0, 32(%rbx)
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps %xmm0, 16(%rbx)
@@ -238,50 +238,50 @@
; CHECK-NEXT: subq $96, %rsp
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: movaps %xmm2, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
-; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmodf@PLT
@@ -313,31 +313,31 @@
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
; CHECK-NEXT: movaps %xmm1, (%rbx)
; CHECK-NEXT: addq $64, %rsp
@@ -356,60 +356,60 @@
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm4, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
; CHECK-NEXT: callq fmod@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; CHECK-NEXT: callq fmod@PLT
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmod@PLT
+; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
; CHECK-NEXT: callq fmod@PLT
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: callq fmod@PLT
+; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
; CHECK-NEXT: callq fmod@PLT
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmod@PLT
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
; CHECK-NEXT: callq fmod@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmod@PLT
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, 48(%rbx)
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movaps %xmm0, 48(%rbx)
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps %xmm0, 32(%rbx)
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps %xmm0, 16(%rbx)
@@ -433,29 +433,29 @@
; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm2, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
; CHECK-NEXT: callq fmod@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmod@PLT
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
; CHECK-NEXT: callq fmod@PLT
; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmod@PLT
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, 16(%rbx)
+; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movaps %xmm0, 16(%rbx)
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps %xmm0, (%rbx)
; CHECK-NEXT: addq $80, %rsp
@@ -473,17 +473,17 @@
; CHECK-NEXT: subq $48, %rsp
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: callq fmod@PLT
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmod@PLT
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, (%rbx)
+; CHECK-NEXT: callq fmod@PLT
+; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: movaps %xmm0, (%rbx)
; CHECK-NEXT: addq $48, %rsp
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
@@ -532,34 +532,35 @@
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
@@ -640,34 +641,35 @@
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
@@ -748,34 +750,35 @@
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movss (%rsp), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
@@ -856,34 +859,35 @@
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
@@ -988,34 +992,35 @@
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
@@ -1095,35 +1100,36 @@
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
@@ -1221,35 +1227,36 @@
; CHECK-NEXT: callq __truncsfhf2@PLT
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movd %xmm0, (%rsp) # 4-byte Folded Spill
-; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload
-; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: callq __extendhfsf2@PLT
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: callq fmodf@PLT
; CHECK-NEXT: callq __truncsfhf2@PLT
-; CHECK-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; CHECK-NEXT: punpckldq (%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; CHECK-NEXT: psrlq $48, %xmm0
; CHECK-NEXT: callq __extendhfsf2@PLT
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -53,8 +53,8 @@
; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE2-NEXT: addsd %xmm1, %xmm0
; SSE2-NEXT: retq
;
@@ -127,17 +127,16 @@
define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 {
; SSE2-LABEL: trunc_unsigned_v2f64:
; SSE2: # %bb.0:
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; SSE2-NEXT: cvttsd2si %xmm1, %rax
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: sarq $63, %rcx
; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: subsd %xmm2, %xmm1
-; SSE2-NEXT: cvttsd2si %xmm1, %rax
-; SSE2-NEXT: cvttsd2si %xmm0, %rcx
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: sarq $63, %rdx
-; SSE2-NEXT: andq %rax, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: cvttsd2si %xmm1, %rdx
+; SSE2-NEXT: andq %rcx, %rdx
+; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE2-NEXT: cvttsd2si %xmm0, %rax
; SSE2-NEXT: subsd %xmm2, %xmm0
; SSE2-NEXT: cvttsd2si %xmm0, %rcx
@@ -146,15 +145,14 @@
; SSE2-NEXT: andq %rcx, %rdx
; SSE2-NEXT: orq %rax, %rdx
; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-;
SSE2-NEXT: psrlq $32, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: psrlq $32, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_unsigned_v2f64: @@ -179,63 +177,61 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v4f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd %xmm1, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: subsd %xmm3, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] +; SSE2-NEXT: cvttsd2si %xmm3, %rax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: subsd %xmm2, %xmm3 +; SSE2-NEXT: cvttsd2si %xmm3, %rdx +; SSE2-NEXT: andq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %rdx, %xmm3 ; SSE2-NEXT: cvttsd2si %xmm1, %rax -; SSE2-NEXT: cvttsd2si %xmm2, %rcx -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: andq %rax, %rdx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: cvttsd2si %xmm2, %rax -; SSE2-NEXT: subsd %xmm3, %xmm2 -; SSE2-NEXT: cvttsd2si %xmm2, %rcx +; SSE2-NEXT: subsd %xmm2, %xmm1 +; SSE2-NEXT: cvttsd2si %xmm1, %rcx ; SSE2-NEXT: movq %rax, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: andq %rcx, %rdx ; SSE2-NEXT: orq %rax, %rdx -; SSE2-NEXT: movq %rdx, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movapd %xmm0, %xmm2 -; SSE2-NEXT: subsd %xmm3, %xmm2 -; SSE2-NEXT: cvttsd2si %xmm2, %rax -; SSE2-NEXT: cvttsd2si %xmm0, %rcx -; SSE2-NEXT: movq %rcx, %rdx -; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: andq %rax, %rdx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: movq %rdx, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movq %rdx, %xmm1 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] +; SSE2-NEXT: cvttsd2si %xmm3, %rax +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: subsd %xmm2, %xmm3 +; SSE2-NEXT: cvttsd2si %xmm3, %rdx +; SSE2-NEXT: andq %rcx, %rdx +; SSE2-NEXT: orq %rax, %rdx +; SSE2-NEXT: movq %rdx, %xmm3 ; SSE2-NEXT: cvttsd2si %xmm0, %rax -; SSE2-NEXT: subsd %xmm3, %xmm0 +; SSE2-NEXT: subsd %xmm2, %xmm0 ; SSE2-NEXT: cvttsd2si %xmm0, %rcx ; SSE2-NEXT: movq %rax, %rdx ; SSE2-NEXT: sarq $63, %rdx ; SSE2-NEXT: andq %rcx, %rdx ; SSE2-NEXT: orq %rax, %rdx ; SSE2-NEXT: movq %rdx, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: psrlq $32, %xmm2 +; SSE2-NEXT: psrlq $32, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; SSE2-NEXT: por %xmm5, %xmm2 +; SSE2-NEXT: 
por %xmm5, %xmm0 ; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; SSE2-NEXT: subpd %xmm6, %xmm2 -; SSE2-NEXT: addpd %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: subpd %xmm6, %xmm0 +; SSE2-NEXT: addpd %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 ; SSE2-NEXT: psrlq $32, %xmm1 ; SSE2-NEXT: por %xmm5, %xmm1 ; SSE2-NEXT: subpd %xmm6, %xmm1 -; SSE2-NEXT: addpd %xmm0, %xmm1 -; SSE2-NEXT: movapd %xmm2, %xmm0 +; SSE2-NEXT: addpd %xmm2, %xmm1 ; SSE2-NEXT: retq ; ; SSE41-LABEL: trunc_unsigned_v4f64: @@ -624,12 +620,13 @@ define <2 x double> @trunc_signed_v2f64_nsz(<2 x double> %x) #0 { ; SSE2-LABEL: trunc_signed_v2f64_nsz: ; SSE2: # %bb.0: -; SSE2-NEXT: cvttsd2si %xmm0, %rax -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE2-NEXT: cvttsd2si %xmm1, %rax ; SSE2-NEXT: cvttsd2si %xmm0, %rcx +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sd %rax, %xmm1 ; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: cvtsi2sd %rcx, %xmm1 +; SSE2-NEXT: cvtsi2sd %rcx, %xmm0 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; @@ -655,20 +652,22 @@ define <4 x double> @trunc_signed_v4f64_nsz(<4 x double> %x) #0 { ; SSE2-LABEL: trunc_signed_v4f64_nsz: ; SSE2: # %bb.0: -; SSE2-NEXT: cvttsd2si %xmm1, %rax -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE2-NEXT: cvttsd2si %xmm1, %rcx -; SSE2-NEXT: cvttsd2si %xmm0, %rdx -; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE2-NEXT: cvttsd2si %xmm2, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; SSE2-NEXT: cvttsd2si %xmm2, %rcx +; SSE2-NEXT: cvttsd2si %xmm1, %rdx ; SSE2-NEXT: cvttsd2si %xmm0, %rsi -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sd %rdx, %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rsi, %xmm1 +; SSE2-NEXT: cvtsi2sd %rcx, %xmm1 +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2sd %rsi, %xmm0 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2sd %rax, %xmm2 ; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: cvtsi2sd %rcx, %xmm2 +; SSE2-NEXT: cvtsi2sd %rdx, %xmm1 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -885,8 +885,7 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; SSE-LABEL: not_a_hsub_2: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] ; SSE-NEXT: subss %xmm3, %xmm2 @@ -895,8 +894,8 @@ ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm1[3,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE-NEXT: xorps %xmm3, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] ; SSE-NEXT: subss %xmm3, %xmm2 ; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; SSE-NEXT: subss %xmm3, %xmm1 @@ -942,11 +941,10 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) { ; 
SSE-LABEL: not_a_hsub_3: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE-NEXT: subsd %xmm0, %xmm2 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE-NEXT: movapd %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/haddsub-3.ll b/llvm/test/CodeGen/X86/haddsub-3.ll --- a/llvm/test/CodeGen/X86/haddsub-3.ll +++ b/llvm/test/CodeGen/X86/haddsub-3.ll @@ -12,8 +12,8 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] ; SSE2-NEXT: addps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -21,8 +21,8 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -172,8 +172,7 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) { ; SSE_SLOW-LABEL: hadd_v2f64: ; SSE_SLOW: # %bb.0: -; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE_SLOW-NEXT: addsd %xmm0, %xmm1 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE_SLOW-NEXT: retq @@ -216,8 +215,7 @@ define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) { ; SSE_SLOW-LABEL: hadd_v2f64_scalar_splat: ; SSE_SLOW: # %bb.0: -; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE_SLOW-NEXT: addsd %xmm0, %xmm1 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE_SLOW-NEXT: retq @@ -261,11 +259,9 @@ define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) { ; SSE_SLOW-LABEL: hadd_v4f64_scalar_splat: ; SSE_SLOW: # %bb.0: -; SSE_SLOW-NEXT: movapd %xmm0, %xmm2 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE_SLOW-NEXT: addsd %xmm0, %xmm2 -; SSE_SLOW-NEXT: movapd %xmm1, %xmm3 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] ; SSE_SLOW-NEXT: addsd %xmm1, %xmm3 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0] @@ -296,8 +292,7 @@ define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) { ; SSE_SLOW-LABEL: hadd_v4f64_scalar_broadcast: ; SSE_SLOW: # %bb.0: -; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE_SLOW-NEXT: addsd %xmm0, %xmm1 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 @@ -350,12 +345,11 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) { ; 
SSE_SLOW-LABEL: hadd_v4f64: ; SSE_SLOW: # %bb.0: -; SSE_SLOW-NEXT: movapd %xmm0, %xmm2 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE_SLOW-NEXT: addsd %xmm0, %xmm2 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0] -; SSE_SLOW-NEXT: movapd %xmm1, %xmm2 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE_SLOW-NEXT: xorps %xmm2, %xmm2 +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE_SLOW-NEXT: addsd %xmm1, %xmm2 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0] ; SSE_SLOW-NEXT: retq @@ -380,8 +374,7 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) { ; SSE_SLOW-LABEL: hsub_v2f64: ; SSE_SLOW: # %bb.0: -; SSE_SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE_SLOW-NEXT: subsd %xmm1, %xmm0 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] ; SSE_SLOW-NEXT: retq @@ -424,12 +417,11 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) { ; SSE_SLOW-LABEL: hsub_v4f64: ; SSE_SLOW: # %bb.0: -; SSE_SLOW-NEXT: movapd %xmm0, %xmm2 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE_SLOW-NEXT: subsd %xmm2, %xmm0 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] -; SSE_SLOW-NEXT: movapd %xmm1, %xmm2 -; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE_SLOW-NEXT: xorps %xmm2, %xmm2 +; SSE_SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE_SLOW-NEXT: subsd %xmm2, %xmm1 ; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] ; SSE_SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -115,8 +115,7 @@ define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) { ; SSE-SLOW-LABEL: test5_undef: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE-SLOW-NEXT: retq ; @@ -189,8 +188,7 @@ ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE-SLOW-NEXT: addss %xmm0, %xmm1 -; SSE-SLOW-NEXT: movaps %xmm0, %xmm2 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-SLOW-NEXT: addss %xmm2, %xmm0 ; SSE-SLOW-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -465,10 +463,17 @@ } define <2 x double> @add_pd_010(<2 x double> %x) { -; SSE-LABEL: add_pd_010: -; SSE: # %bb.0: -; SSE-NEXT: haddpd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE-SLOW-LABEL: add_pd_010: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 +; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: add_pd_010: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: haddpd %xmm0, %xmm0 +; SSE-FAST-NEXT: retq ; ; AVX-SLOW-LABEL: add_pd_010: ; AVX-SLOW: # %bb.0: @@ -670,10 +675,8 @@ define <4 x double> @add_pd_011(<4 x double> %0, <4 x double> %1) { ; SSE-SLOW-LABEL: add_pd_011: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movapd %xmm2, %xmm1 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-SLOW-NEXT: movapd %xmm0, %xmm3 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = 
xmm2[1],xmm1[1] +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; SSE-SLOW-NEXT: addpd %xmm3, %xmm0 ; SSE-SLOW-NEXT: addpd %xmm2, %xmm1 ; SSE-SLOW-NEXT: retq @@ -961,10 +964,9 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { ; SSE-SLOW-LABEL: PR45747_2: ; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movaps %xmm1, %xmm0 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] ; SSE-SLOW-NEXT: addps %xmm1, %xmm0 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-SLOW-NEXT: retq ; ; SSE-FAST-LABEL: PR45747_2: @@ -1127,8 +1129,8 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_u123: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm2, %xmm1 -; SSE-SLOW-NEXT: movapd %xmm3, %xmm2 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-SLOW-NEXT: xorps %xmm2, %xmm2 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; SSE-SLOW-NEXT: addsd %xmm3, %xmm2 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] @@ -1186,8 +1188,8 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_0u23: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm2, %xmm0 -; SSE-SLOW-NEXT: movapd %xmm3, %xmm2 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; SSE-SLOW-NEXT: xorps %xmm2, %xmm2 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; SSE-SLOW-NEXT: addsd %xmm3, %xmm2 ; SSE-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSE-SLOW-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0] @@ -1233,8 +1235,8 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_01u3: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm1, %xmm0 -; SSE-SLOW-NEXT: movapd %xmm3, %xmm1 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSE-SLOW-NEXT: addsd %xmm3, %xmm1 ; SSE-SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0] ; SSE-SLOW-NEXT: retq @@ -1287,8 +1289,8 @@ ; SSE-SLOW-LABEL: PR34724_add_v4f64_012u: ; SSE-SLOW: # %bb.0: ; SSE-SLOW-NEXT: haddpd %xmm1, %xmm0 -; SSE-SLOW-NEXT: movapd %xmm2, %xmm1 -; SSE-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSE-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE-SLOW-NEXT: addsd %xmm2, %xmm1 ; SSE-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -43,8 +43,7 @@ define <2 x double> @haddpd3(<2 x double> %x) { ; SSE3-SLOW-LABEL: haddpd3: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -212,8 +211,7 @@ define <2 x double> @hsubpd2(<2 x double> %x) { ; SSE3-SLOW-LABEL: hsubpd2: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -487,8 +485,7 @@ define float @extract_extract23_v4f32_fadd_f32(<4 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq @@ -548,8 +545,7 @@ define float @extract_extract23_v4f32_fadd_f32_commute(<4 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq @@ -581,8 +577,7 @@ define double @extract_extract01_v2f64_fadd_f64(<2 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -610,8 +605,7 @@ define double @extract_extract01_v2f64_fadd_f64_commute(<2 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -667,8 +661,7 @@ define float @extract_extract23_v4f32_fsub_f32(<4 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE3-SLOW-NEXT: subss %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 @@ -720,8 +713,7 @@ define float @extract_extract23_v4f32_fsub_f32_commute(<4 x float> %x) { ; SSE3-LABEL: extract_extract23_v4f32_fsub_f32_commute: ; SSE3: # %bb.0: -; SSE3-NEXT: movaps %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE3-NEXT: subss %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -741,8 +733,7 @@ define double @extract_extract01_v2f64_fsub_f64(<2 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -770,8 +761,7 @@ define double @extract_extract01_v2f64_fsub_f64_commute(<2 x double> %x) { ; SSE3-LABEL: extract_extract01_v2f64_fsub_f64_commute: ; SSE3: # %bb.0: -; SSE3-NEXT: movapd %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: subsd %xmm0, %xmm1 ; SSE3-NEXT: movapd %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -822,8 +812,7 @@ define float @extract_extract23_v8f32_fadd_f32(<8 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq @@ -857,8 +846,7 @@ define float @extract_extract67_v8f32_fadd_f32(<8 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps 
%xmm1, %xmm0 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq @@ -924,8 +912,7 @@ define float @extract_extract23_v8f32_fadd_f32_commute(<8 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq @@ -959,8 +946,7 @@ define float @extract_extract67_v8f32_fadd_f32_commute(<8 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq @@ -996,8 +982,7 @@ define double @extract_extract01_v4f64_fadd_f64(<4 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1027,8 +1012,7 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1061,8 +1045,7 @@ define double @extract_extract01_v4f64_fadd_f64_commute(<4 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1092,8 +1075,7 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1156,8 +1138,7 @@ define float @extract_extract23_v8f32_fsub_f32(<8 x float> %x) { ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE3-SLOW-NEXT: subss %xmm0, %xmm1 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0 @@ -1248,8 +1229,7 @@ define double @extract_extract01_v4f64_fsub_f64(<4 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ 
-1281,8 +1261,7 @@ define double @extract_extract01_v4f64_fsub_f64_commute(<4 x double> %x) { ; SSE3-LABEL: extract_extract01_v4f64_fsub_f64_commute: ; SSE3: # %bb.0: -; SSE3-NEXT: movapd %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: subsd %xmm0, %xmm1 ; SSE3-NEXT: movapd %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -1364,8 +1343,7 @@ define double @extract_extract01_v8f64_fadd_f64(<8 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1395,8 +1373,7 @@ define double @extract_extract01_v8f64_fadd_f64_commute(<8 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1476,8 +1453,7 @@ define double @extract_extract01_v8f64_fsub_f64(<8 x double> %x) { ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0 ; SSE3-SLOW-NEXT: retq ; @@ -1507,8 +1483,7 @@ define double @extract_extract01_v8f64_fsub_f64_commute(<8 x double> %x) { ; SSE3-LABEL: extract_extract01_v8f64_fsub_f64_commute: ; SSE3: # %bb.0: -; SSE3-NEXT: movapd %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: subsd %xmm0, %xmm1 ; SSE3-NEXT: movapd %xmm1, %xmm0 ; SSE3-NEXT: retq @@ -1628,8 +1603,8 @@ ; SSE3-SLOW-LABEL: fadd_reduce_v8f32: ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: addps %xmm2, %xmm1 -; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE3-SLOW-NEXT: xorps %xmm2, %xmm2 +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm2 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE3-SLOW-NEXT: addss %xmm2, %xmm1 @@ -1673,8 +1648,8 @@ ; SSE3-SLOW-LABEL: fadd_reduce_v4f64: ; SSE3-SLOW: # %bb.0: ; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1 -; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE3-SLOW-NEXT: xorps %xmm2, %xmm2 +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2 ; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0 ; SSE3-SLOW-NEXT: retq @@ -1775,8 +1750,7 @@ define float @hadd32_4(<4 x float> %x225) { ; SSE3-SLOW-LABEL: hadd32_4: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -1784,8 +1758,7 @@ ; ; SSE3-FAST-LABEL: hadd32_4: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq @@ 
-1815,8 +1788,7 @@ define float @hadd32_8(<8 x float> %x225) { ; SSE3-SLOW-LABEL: hadd32_8: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -1824,8 +1796,7 @@ ; ; SSE3-FAST-LABEL: hadd32_8: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq @@ -1857,8 +1828,7 @@ define float @hadd32_16(<16 x float> %x225) { ; SSE3-SLOW-LABEL: hadd32_16: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -1866,8 +1836,7 @@ ; ; SSE3-FAST-LABEL: hadd32_16: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq @@ -1899,8 +1868,7 @@ define float @hadd32_4_optsize(<4 x float> %x225) optsize { ; SSE3-LABEL: hadd32_4_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: movaps %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq @@ -1922,8 +1890,7 @@ define float @hadd32_8_optsize(<8 x float> %x225) optsize { ; SSE3-LABEL: hadd32_8_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: movaps %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq @@ -1946,8 +1913,7 @@ define float @hadd32_16_optsize(<16 x float> %x225) optsize { ; SSE3-LABEL: hadd32_16_optsize: ; SSE3: # %bb.0: -; SSE3-NEXT: movaps %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq @@ -1970,8 +1936,7 @@ define float @hadd32_4_pgso(<4 x float> %x225) !prof !14 { ; SSE3-LABEL: hadd32_4_pgso: ; SSE3: # %bb.0: -; SSE3-NEXT: movaps %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq @@ -1993,8 +1958,7 @@ define float @hadd32_8_pgso(<8 x float> %x225) !prof !14 { ; SSE3-LABEL: hadd32_8_pgso: ; SSE3: # %bb.0: -; SSE3-NEXT: movaps %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: addps %xmm1, %xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq @@ -2017,8 +1981,7 @@ define float @hadd32_16_pgso(<16 x float> %x225) !prof !14 { ; SSE3-LABEL: hadd32_16_pgso: ; SSE3: # %bb.0: -; SSE3-NEXT: movaps %xmm0, %xmm1 -; SSE3-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-NEXT: addps %xmm1, 
%xmm0 ; SSE3-NEXT: haddps %xmm0, %xmm0 ; SSE3-NEXT: retq @@ -2041,8 +2004,7 @@ define float @partial_reduction_fadd_v8f32(<8 x float> %x) { ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -2050,8 +2012,7 @@ ; ; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq @@ -2085,8 +2046,7 @@ define float @partial_reduction_fadd_v8f32_wrong_flags(<8 x float> %x) { ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -2094,8 +2054,7 @@ ; ; SSE3-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq @@ -2127,8 +2086,7 @@ define float @partial_reduction_fadd_v16f32(<16 x float> %x) { ; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32: ; SSE3-SLOW: # %bb.0: -; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -2136,8 +2094,7 @@ ; ; SSE3-FAST-LABEL: partial_reduction_fadd_v16f32: ; SSE3-FAST: # %bb.0: -; SSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSE3-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -569,11 +569,11 @@ ; CHECK-LIBCALL-NEXT: subq $64, %rsp ; CHECK-LIBCALL-NEXT: movq %rdi, %rbx ; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload @@ -611,10 +611,11 @@ ; CHECK-I686-NEXT: movaps %xmm0, %xmm1 ; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; CHECK-I686-NEXT: movss %xmm1, (%esp) +; 
CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: calll __truncsfhf2 ; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncsfhf2 ; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill @@ -656,8 +657,8 @@ ; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -11,8 +11,7 @@ define float @PR37890_v4f32(<4 x float> %a) { ; SSE2-LABEL: PR37890_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] @@ -21,8 +20,7 @@ ; ; SSSE3-SLOW-LABEL: PR37890_v4f32: ; SSSE3-SLOW: # %bb.0: -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -68,16 +66,16 @@ ; SSE2-LABEL: PR37890_v4f64: ; SSE2: # %bb.0: ; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addsd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-SLOW-LABEL: PR37890_v4f64: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; @@ -125,8 +123,8 @@ ; SSE2-LABEL: PR37890_v8f32: ; SSE2: # %bb.0: ; SSE2-NEXT: addps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] @@ -136,8 +134,8 @@ ; SSSE3-SLOW-LABEL: PR37890_v8f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: 
addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -198,8 +196,8 @@ ; SSE2-NEXT: addpd %xmm3, %xmm1 ; SSE2-NEXT: addpd %xmm2, %xmm1 ; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movapd %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addsd %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -208,8 +206,8 @@ ; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm1 ; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: addsd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: retq ; @@ -267,8 +265,8 @@ ; SSE2-NEXT: addps %xmm3, %xmm1 ; SSE2-NEXT: addps %xmm2, %xmm1 ; SSE2-NEXT: addps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] @@ -280,8 +278,8 @@ ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0 @@ -292,8 +290,8 @@ ; SSSE3-FAST-NEXT: addps %xmm3, %xmm1 ; SSSE3-FAST-NEXT: addps %xmm2, %xmm1 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSSE3-FAST-NEXT: xorps %xmm1, %xmm1 +; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0 ; SSSE3-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -191,15 +191,14 @@ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2 -; SSSE3-SLOW-NEXT: movaps %xmm5, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm1 -; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm2 +; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5 +; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6 -; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm4 -; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm4 -; SSSE3-SLOW-NEXT: movaps %xmm4, %xmm1 +; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6 +; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1] +; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1 ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32: @@ -549,8 +548,8 @@ ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm0 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-SLOW-NEXT: xorps %xmm1, %xmm1 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] 
; SSSE3-SLOW-NEXT: addps %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3 @@ -574,8 +573,8 @@ ; SSSE3-FAST-NEXT: addps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm0 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm0 -; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSSE3-FAST-NEXT: xorps %xmm1, %xmm1 +; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSSE3-FAST-NEXT: addps %xmm0, %xmm1 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSSE3-FAST-NEXT: addps %xmm1, %xmm3 @@ -821,30 +820,29 @@ ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSSE3-SLOW-NEXT: addss %xmm0, %xmm4 -; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSSE3-SLOW-NEXT: addss %xmm5, %xmm0 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4 -; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSSE3-SLOW-NEXT: xorps %xmm5, %xmm5 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm5 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSSE3-SLOW-NEXT: addss %xmm5, %xmm1 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSSE3-SLOW-NEXT: addss %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm4 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSSE3-SLOW-NEXT: xorps %xmm4, %xmm4 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1] ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm2 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSSE3-SLOW-NEXT: addss %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm4 -; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1] +; SSSE3-SLOW-NEXT: xorps %xmm4, %xmm4 +; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1] ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm4 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSSE3-SLOW-NEXT: addss %xmm4, %xmm3 @@ -856,30 +854,29 @@ ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4 ; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4 -; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSSE3-FAST-NEXT: addss %xmm5, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4 ; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4 -; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSSE3-FAST-NEXT: xorps %xmm5, %xmm5 +; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] ; SSSE3-FAST-NEXT: addss %xmm4, %xmm5 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSSE3-FAST-NEXT: addss %xmm5, %xmm1 ; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1 -; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4 -; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1] +; SSSE3-FAST-NEXT: xorps %xmm4, %xmm4 +; SSSE3-FAST-NEXT: movhlps {{.*#+}} 
xmm4 = xmm2[1],xmm4[1]
 ; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
 ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSSE3-FAST-NEXT: xorps %xmm4, %xmm4
+; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
 ; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
@@ -959,20 +956,18 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3) {
 ; SSSE3-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; SSSE3-SLOW: # %bb.0:
-; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm4
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
 ; SSSE3-SLOW-NEXT: addps %xmm4, %xmm0
 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSSE3-SLOW-NEXT: movaps %xmm1, %xmm5
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm5
 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm5[1,1,3,3]
 ; SSSE3-SLOW-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSSE3-SLOW-NEXT: xorps %xmm1, %xmm1
+; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
-; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm2
-; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSSE3-SLOW-NEXT: xorps %xmm2, %xmm2
+; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm2 = xmm3[1],xmm2[1]
 ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
 ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm3
 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
@@ -985,18 +980,17 @@
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
 ; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSSE3-FAST-NEXT: xorps %xmm4, %xmm4
+; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
 ; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSSE3-FAST-NEXT: xorps %xmm1, %xmm1
+; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; SSSE3-FAST-NEXT: xorps %xmm2, %xmm2
+; SSSE3-FAST-NEXT: movhlps {{.*#+}} xmm2 = xmm3[1],xmm2[1]
 ; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
 ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
 ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
diff --git a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
--- a/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-x-i128.ll
@@ -16,7 +16,7 @@
 ; CHECK-NEXT: movdqa %xmm1, %xmm0
 ; CHECK-NEXT: #NO_APP
 ; CHECK-NEXT: movq %xmm0, %rax
-; CHECK-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: movq %xmm0, %rdx
 ; CHECK-NEXT: retq
 %3 = zext i64 %1 to i128
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -285,8 +285,7 @@ define float @extract_lane_insertps_6123(<4 x float> %a0, ptr%p1) {
 ; SSE-LABEL: extract_lane_insertps_6123:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movlps 8(%rdi), %xmm0 # xmm0 = mem[0,1],xmm0[2,3]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: extract_lane_insertps_6123:
diff --git a/llvm/test/CodeGen/X86/load-partial-dot-product.ll b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
--- a/llvm/test/CodeGen/X86/load-partial-dot-product.ll
+++ b/llvm/test/CodeGen/X86/load-partial-dot-product.ll
@@ -19,9 +19,9 @@
 ; SSE2-NEXT: mulps %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: addss %xmm1, %xmm0
+; SSE2-NEXT: addss %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: dot3_float4:
@@ -30,9 +30,9 @@
 ; SSSE3-NEXT: movups (%rsi), %xmm1
 ; SSSE3-NEXT: mulps %xmm0, %xmm1
 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSSE3-NEXT: addss %xmm1, %xmm0
-; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSSE3-NEXT: addss %xmm1, %xmm0
+; SSSE3-NEXT: addss %xmm2, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: dot3_float4:
@@ -41,9 +41,9 @@
 ; SSE41-NEXT: movups (%rsi), %xmm1
 ; SSE41-NEXT: mulps %xmm0, %xmm1
 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE41-NEXT: addss %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: dot3_float4:
@@ -74,9 +74,9 @@
 ; SSE2-NEXT: mulps %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: addss %xmm1, %xmm0
+; SSE2-NEXT: addss %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: dot3_float4_as_float3:
@@ -85,9 +85,9 @@
 ; SSSE3-NEXT: movups (%rsi), %xmm1
 ; SSSE3-NEXT: mulps %xmm0, %xmm1
 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSSE3-NEXT: addss %xmm1, %xmm0
-; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSSE3-NEXT: addss %xmm1, %xmm0
+; SSSE3-NEXT: addss %xmm2, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: dot3_float4_as_float3:
@@ -96,9 +96,9 @@
 ; SSE41-NEXT: movups (%rsi), %xmm1
 ; SSE41-NEXT: mulps %xmm0, %xmm1
 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE41-NEXT: addss %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: dot3_float4_as_float3:
@@ -137,9 +137,10 @@
 ; SSE2-NEXT: mulps %xmm0, %xmm1
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: addss %xmm1, %xmm0
+; SSE2-NEXT: addss %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: dot3_float3:
@@ -154,9 +155,10 @@
 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 ; SSSE3-NEXT: mulps %xmm0, %xmm1
 ; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSSE3-NEXT: addss %xmm1, %xmm0
-; SSSE3-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSSE3-NEXT: addss %xmm1, %xmm0
+; SSSE3-NEXT: addss %xmm2, %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: dot3_float3:
@@ -167,9 +169,9 @@
 ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
 ; SSE41-NEXT: mulps %xmm0, %xmm1
 ; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE41-NEXT: addss %xmm1, %xmm0
+; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: dot3_float3:
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -728,8 +728,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB3_6
 ; SSE2-NEXT: LBB3_5: ## %cond.store4
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: movss %xmm1, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testb $8, %al
@@ -883,9 +883,9 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB4_6
 ; SSE2-NEXT: LBB4_5: ## %cond.store4
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE2-NEXT: movd %xmm2, (%rdi)
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE2-NEXT: movss %xmm2, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB4_8
@@ -908,8 +908,8 @@
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je LBB4_14
 ; SSE2-NEXT: LBB4_13: ## %cond.store16
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testb $-128, %al
@@ -1167,8 +1167,8 @@
 ; SSE2-NEXT: movaps %xmm0, %xmm4
 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
 ; SSE2-NEXT: movss %xmm4, 4(%rdi)
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
 ; SSE2-NEXT: movss %xmm4, 8(%rdi)
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT: movss %xmm0, 12(%rdi)
@@ -1176,23 +1176,23 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
 ; SSE2-NEXT: movss %xmm0, 20(%rdi)
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, 24(%rdi)
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: movss %xmm1, 28(%rdi)
 ; SSE2-NEXT: movss %xmm2, 32(%rdi)
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1]
-; SSE2-NEXT: movss %xmm0, 36(%rdi)
-; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE2-NEXT: movss %xmm2, 40(%rdi)
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
+; SSE2-NEXT: movss %xmm2, 36(%rdi)
+; SSE2-NEXT: movss %xmm0, 40(%rdi)
 ; SSE2-NEXT: movss %xmm3, 44(%rdi)
 ; SSE2-NEXT: movaps %xmm3, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[1,1]
 ; SSE2-NEXT: movss %xmm0, 48(%rdi)
-; SSE2-NEXT: movaps %xmm3, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm3[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, 52(%rdi)
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE2-NEXT: movss %xmm3, 56(%rdi)
@@ -1400,8 +1400,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB6_6
 ; SSE2-NEXT: LBB6_5: ## %cond.store4
-; SSE2-NEXT: movaps %xmm0, %xmm8
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm8, %xmm8
+; SSE2-NEXT: movhlps {{.*#+}} xmm8 = xmm0[1],xmm8[1]
 ; SSE2-NEXT: movss %xmm8, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testb $8, %al
@@ -1425,8 +1425,8 @@
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je LBB6_14
 ; SSE2-NEXT: LBB6_13: ## %cond.store16
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testb $-128, %al
@@ -1450,8 +1450,8 @@
 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
 ; SSE2-NEXT: je LBB6_22
 ; SSE2-NEXT: LBB6_21: ## %cond.store28
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
@@ -1475,8 +1475,8 @@
 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
 ; SSE2-NEXT: je LBB6_30
 ; SSE2-NEXT: LBB6_29: ## %cond.store40
-; SSE2-NEXT: movaps %xmm3, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm3[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
@@ -1500,8 +1500,8 @@
 ; SSE2-NEXT: testl $262144, %eax ## imm = 0x40000
 ; SSE2-NEXT: je LBB6_38
 ; SSE2-NEXT: LBB6_37: ## %cond.store52
-; SSE2-NEXT: movaps %xmm4, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm4[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testl $524288, %eax ## imm = 0x80000
@@ -1525,8 +1525,8 @@
 ; SSE2-NEXT: testl $4194304, %eax ## imm = 0x400000
 ; SSE2-NEXT: je LBB6_46
 ; SSE2-NEXT: LBB6_45: ## %cond.store64
-; SSE2-NEXT: movaps %xmm5, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm5[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testl $8388608, %eax ## imm = 0x800000
@@ -1550,8 +1550,8 @@
 ; SSE2-NEXT: testl $67108864, %eax ## imm = 0x4000000
 ; SSE2-NEXT: je LBB6_54
 ; SSE2-NEXT: LBB6_53: ## %cond.store76
-; SSE2-NEXT: movaps %xmm6, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm6[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testl $134217728, %eax ## imm = 0x8000000
@@ -1575,8 +1575,8 @@
 ; SSE2-NEXT: testl $1073741824, %eax ## imm = 0x40000000
 ; SSE2-NEXT: je LBB6_62
 ; SSE2-NEXT: LBB6_61: ## %cond.store88
-; SSE2-NEXT: movaps %xmm7, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm7[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, (%rdi)
 ; SSE2-NEXT: addq $4, %rdi
 ; SSE2-NEXT: testl $-2147483648, %eax ## imm = 0x80000000
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -309,8 +309,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB4_6
 ; SSE2-NEXT: LBB4_5: ## %cond.store3
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: movss %xmm1, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB4_8
@@ -434,8 +434,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB5_6
 ; SSE2-NEXT: LBB5_5: ## %cond.store3
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE2-NEXT: movss %xmm2, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB5_8
@@ -455,8 +455,8 @@
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je LBB5_14
 ; SSE2-NEXT: LBB5_13: ## %cond.store11
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, 24(%rdi)
 ; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je LBB5_16
@@ -641,8 +641,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB6_6
 ; SSE2-NEXT: LBB6_5: ## %cond.store3
-; SSE2-NEXT: movaps %xmm0, %xmm4
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
 ; SSE2-NEXT: movss %xmm4, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB6_8
@@ -662,8 +662,8 @@
 ; SSE2-NEXT: testb $64, %al
 ; SSE2-NEXT: je LBB6_14
 ; SSE2-NEXT: LBB6_13: ## %cond.store11
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, 24(%rdi)
 ; SSE2-NEXT: testb $-128, %al
 ; SSE2-NEXT: je LBB6_16
@@ -683,8 +683,8 @@
 ; SSE2-NEXT: testl $1024, %eax ## imm = 0x400
 ; SSE2-NEXT: je LBB6_22
 ; SSE2-NEXT: LBB6_21: ## %cond.store19
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, 40(%rdi)
 ; SSE2-NEXT: testl $2048, %eax ## imm = 0x800
 ; SSE2-NEXT: je LBB6_24
@@ -704,8 +704,8 @@
 ; SSE2-NEXT: testl $16384, %eax ## imm = 0x4000
 ; SSE2-NEXT: je LBB6_30
 ; SSE2-NEXT: LBB6_29: ## %cond.store27
-; SSE2-NEXT: movaps %xmm3, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm3[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, 56(%rdi)
 ; SSE2-NEXT: testl $32768, %eax ## imm = 0x8000
 ; SSE2-NEXT: je LBB6_32
@@ -5155,8 +5155,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB27_6
 ; SSE2-NEXT: LBB27_5: ## %cond.store3
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: movss %xmm1, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB27_8
@@ -5471,8 +5471,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB30_6
 ; SSE2-NEXT: LBB30_5: ## %cond.store3
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE2-NEXT: movss %xmm2, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB30_8
@@ -5492,8 +5492,8 @@
 ; SSE2-NEXT: testb $4, %al
 ; SSE2-NEXT: je LBB30_14
 ; SSE2-NEXT: LBB30_13: ## %cond.store12
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2-NEXT: movss %xmm0, 8(%rdi)
 ; SSE2-NEXT: testb $8, %al
 ; SSE2-NEXT: je LBB30_16
diff --git a/llvm/test/CodeGen/X86/pow.ll b/llvm/test/CodeGen/X86/pow.ll
--- a/llvm/test/CodeGen/X86/pow.ll
+++ b/llvm/test/CodeGen/X86/pow.ll
@@ -111,8 +111,8 @@
 ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq powf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
 ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq powf@PLT
 ; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
@@ -145,8 +145,8 @@
 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT: callq pow@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT: callq pow@PLT
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll
--- a/llvm/test/CodeGen/X86/pr11334.ll
+++ b/llvm/test/CodeGen/X86/pr11334.ll
@@ -20,14 +20,13 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind {
 ; SSE-LABEL: v3f2d_ext_vec:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: cvtps2pd %xmm0, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; SSE-NEXT: cvtps2pd %xmm1, %xmm1
 ; SSE-NEXT: cvtps2pd %xmm0, %xmm0
-; SSE-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: movlps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: fldl -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: v3f2d_ext_vec:
@@ -42,10 +41,9 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind {
 ; SSE-LABEL: v4f2d_ext_vec:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: cvtps2pd %xmm0, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvtps2pd %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
+; SSE-NEXT: cvtps2pd %xmm1, %xmm1
+; SSE-NEXT: cvtps2pd %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: v4f2d_ext_vec:
@@ -60,13 +58,13 @@ define <8 x double> @v8f2d_ext_vec(<8 x float> %v1) nounwind {
 ; SSE-LABEL: v8f2d_ext_vec:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: cvtps2pd %xmm0, %xmm5
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE-NEXT: cvtps2pd %xmm2, %xmm4
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
+; SSE-NEXT: cvtps2pd %xmm2, %xmm3
+; SSE-NEXT: cvtps2pd %xmm0, %xmm0
 ; SSE-NEXT: cvtps2pd %xmm1, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvtps2pd %xmm0, %xmm4
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: cvtps2pd %xmm1, %xmm3
-; SSE-NEXT: movaps %xmm5, %xmm0
 ; SSE-NEXT: movaps %xmm4, %xmm1
 ; SSE-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
--- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -670,8 +670,7 @@
 ; SSE2_32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2_32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; SSE2_32-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; SSE2_32-NEXT: movapd %xmm0, %xmm1
-; SSE2_32-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2_32-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2_32-NEXT: addsd %xmm0, %xmm1
 ; SSE2_32-NEXT: movsd %xmm1, (%esp)
 ; SSE2_32-NEXT: fldl (%esp)
@@ -684,8 +683,7 @@
 ; SSE2_64-NEXT: movq %rdi, %xmm1
 ; SSE2_64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; SSE2_64-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2_64-NEXT: movapd %xmm1, %xmm0
-; SSE2_64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2_64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2_64-NEXT: addsd %xmm1, %xmm0
 ; SSE2_64-NEXT: retq
 ;
@@ -791,8 +789,7 @@
 ; SSE2_32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE2_32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; SSE2_32-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; SSE2_32-NEXT: movapd %xmm0, %xmm1
-; SSE2_32-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2_32-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2_32-NEXT: addsd %xmm0, %xmm1
 ; SSE2_32-NEXT: movsd %xmm1, (%esp)
 ; SSE2_32-NEXT: fldl (%esp)
@@ -805,8 +802,7 @@
 ; SSE2_64-NEXT: movq %rdi, %xmm1
 ; SSE2_64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
 ; SSE2_64-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2_64-NEXT: movapd %xmm1, %xmm0
-; SSE2_64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE2_64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
 ; SSE2_64-NEXT: addsd %xmm1, %xmm0
 ; SSE2_64-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/split-vector-rem.ll b/llvm/test/CodeGen/X86/split-vector-rem.ll
--- a/llvm/test/CodeGen/X86/split-vector-rem.ll
+++ b/llvm/test/CodeGen/X86/split-vector-rem.ll
@@ -141,50 +141,50 @@
 ; CHECK-NEXT: subq $104, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 112
 ; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: movaps %xmm2, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
-; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: callq fmodf@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: callq fmodf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
 ; CHECK-NEXT: callq fmodf@PLT
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
 ; CHECK-NEXT: callq fmodf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; CHECK-NEXT: callq fmodf@PLT
-; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: callq fmodf@PLT
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -2791,7 +2791,7 @@
 ;
 ; X64-SSE2-LABEL: test_mm_storeh_pi:
 ; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: punpckhqdq %xmm0, %xmm0 # encoding: [0x66,0x0f,0x6d,0xc0]
+; X64-SSE2-NEXT: movhlps %xmm0, %xmm0 # encoding: [0x0f,0x12,0xc0]
 ; X64-SSE2-NEXT: # xmm0 = xmm0[1,1]
 ; X64-SSE2-NEXT: movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0]
 ; X64-SSE2-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1478,8 +1478,7 @@
 ; X86-SSE-LABEL: PR26515:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: pushl %eax
-; X86-SSE-NEXT: movaps %xmm0, %xmm1
-; X86-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; X86-SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; X86-SSE-NEXT: addss %xmm0, %xmm1
 ; X86-SSE-NEXT: movss %xmm1, (%esp)
 ; X86-SSE-NEXT: flds (%esp)
@@ -1498,8 +1497,7 @@
 ;
 ; X64-SSE-LABEL: PR26515:
 ; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movaps %xmm0, %xmm1
-; X64-SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; X64-SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; X64-SSE-NEXT: addss %xmm1, %xmm0
 ; X64-SSE-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -361,13 +361,11 @@ define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: test14:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subss %xmm1, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE-NEXT: subss %xmm1, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: subss %xmm1, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test14:
@@ -442,10 +440,8 @@
 ; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: subss %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSE-NEXT: movaps %xmm1, %xmm5
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
 ; SSE-NEXT: subss %xmm5, %xmm4
 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
 ; SSE-NEXT: addss %xmm3, %xmm5
diff --git a/llvm/test/CodeGen/X86/vec-strict-128.ll b/llvm/test/CodeGen/X86/vec-strict-128.ll
--- a/llvm/test/CodeGen/X86/vec-strict-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-128.ll
@@ -224,19 +224,21 @@ define <4 x float> @f13(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
 ; SSE-X86-LABEL: f13:
 ; SSE-X86: # %bb.0:
-; SSE-X86-NEXT: subl $100, %esp
-; SSE-X86-NEXT: .cfi_def_cfa_offset 104
+; SSE-X86-NEXT: subl $104, %esp
+; SSE-X86-NEXT: .cfi_def_cfa_offset 108
 ; SSE-X86-NEXT: movups %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; SSE-X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; SSE-X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; SSE-X86-NEXT: movss %xmm2, {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT: movss %xmm1, {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT: movss %xmm0, (%esp)
+; SSE-X86-NEXT: xorps %xmm0, %xmm0
+; SSE-X86-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1]
+; SSE-X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
 ; SSE-X86-NEXT: calll fmaf
 ; SSE-X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
 ; SSE-X86-NEXT: wait
 ; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-X86-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
 ; SSE-X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-X86-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -282,7 +284,7 @@
 ; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE-X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-X86-NEXT: addl $100, %esp
+; SSE-X86-NEXT: addl $104, %esp
 ; SSE-X86-NEXT: .cfi_def_cfa_offset 4
 ; SSE-X86-NEXT: retl
 ;
@@ -292,28 +294,28 @@
 ; SSE-X64-NEXT: .cfi_def_cfa_offset 96
 ; SSE-X64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-X64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-X64-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-X64-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-X64-NEXT: callq fmaf@PLT
 ; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; SSE-X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE-X64-NEXT: callq fmaf@PLT
-; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-X64-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; SSE-X64-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-X64-NEXT: callq fmaf@PLT
-; SSE-X64-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
-; SSE-X64-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-X64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-X64-NEXT: callq fmaf@PLT
 ; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE-X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; SSE-X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
@@ -322,7 +324,7 @@
 ; SSE-X64-NEXT: callq fmaf@PLT
 ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-X64-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
+; SSE-X64-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
 ; SSE-X64-NEXT: # xmm1 = xmm1[0],mem[0]
 ; SSE-X64-NEXT: movaps %xmm1, %xmm0
 ; SSE-X64-NEXT: addq $88, %rsp
@@ -380,19 +382,18 @@
 ; SSE-X64-NEXT: .cfi_def_cfa_offset 80
 ; SSE-X64-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-X64-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; SSE-X64-NEXT: callq fma@PLT
 ; SSE-X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
 ; SSE-X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; SSE-X64-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-X64-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-X64-NEXT: callq fma@PLT
+; SSE-X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; SSE-X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-X64-NEXT: movaps %xmm1, %xmm0
+; SSE-X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; SSE-X64-NEXT: callq fma@PLT
+; SSE-X64-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; SSE-X64-NEXT: # xmm0 = xmm0[0],mem[0]
 ; SSE-X64-NEXT: addq $72, %rsp
 ; SSE-X64-NEXT: .cfi_def_cfa_offset 8
 ; SSE-X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
--- a/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128.ll
@@ -109,24 +109,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-32-NEXT: movaps %xmm2, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmoval %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm3, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-32-NEXT: movaps %xmm2, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmoval %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT: movl $0, %edx
@@ -148,10 +146,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_ogt_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm3, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -159,28 +155,28 @@
 ; SSE-64-NEXT: cmoval %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-64-NEXT: movaps %xmm2, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmoval %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmoval %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: cmoval %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_ogt_q:
@@ -263,24 +259,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-32-NEXT: movaps %xmm2, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovael %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm3, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-32-NEXT: movaps %xmm2, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovael %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT: movl $0, %edx
@@ -302,10 +296,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_oge_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm3, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -313,28 +305,28 @@
 ; SSE-64-NEXT: cmovael %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-64-NEXT: movaps %xmm2, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovael %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovael %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: cmovael %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_oge_q:
@@ -417,24 +409,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-32-NEXT: movaps %xmm3, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmoval %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm2, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-32-NEXT: movaps %xmm3, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmoval %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT: movl $0, %edx
@@ -456,10 +446,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_olt_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm2, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -467,28 +455,28 @@
 ; SSE-64-NEXT: cmoval %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-64-NEXT: movaps %xmm3, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmoval %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmoval %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: cmoval %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_olt_q:
@@ -569,24 +557,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-32-NEXT: movaps %xmm3, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovael %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm2, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-32-NEXT: movaps %xmm3, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovael %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT: movl $0, %edx
@@ -608,10 +594,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_ole_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm2, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -619,28 +603,28 @@
 ; SSE-64-NEXT: cmovael %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-64-NEXT: movaps %xmm3, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovael %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovael %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: cmovael %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_ole_q:
@@ -1014,24 +998,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-32-NEXT: movaps %xmm3, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbl %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm2, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-32-NEXT: movaps %xmm3, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbl %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT: movl $0, %edx
@@ -1053,10 +1035,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_ugt_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm2, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -1064,28 +1044,28 @@
 ; SSE-64-NEXT: cmovbl %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-64-NEXT: movaps %xmm3, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbl %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbl %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: cmovbl %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_ugt_q:
@@ -1166,24 +1146,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-32-NEXT: movaps %xmm3, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbel %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm2, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm2, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-32-NEXT: movaps %xmm3, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbel %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-32-NEXT: movl $0, %edx
@@ -1205,10 +1183,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_uge_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm2, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm2[3,3]
-; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -1216,28 +1192,28 @@
 ; SSE-64-NEXT: cmovbel %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
 ; SSE-64-NEXT: movaps %xmm3, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm3[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbel %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbel %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm2, %xmm3
 ; SSE-64-NEXT: cmovbel %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_uge_q:
@@ -1318,24 +1294,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-32-NEXT: movaps %xmm2, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbl %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm3, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-32-NEXT: movaps %xmm2, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbl %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT: movl $0, %edx
@@ -1357,10 +1331,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_ult_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm3, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -1368,28 +1340,28 @@
 ; SSE-64-NEXT: cmovbl %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-64-NEXT: movaps %xmm2, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbl %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbl %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: cmovbl %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_ult_q:
@@ -1472,24 +1444,22 @@
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
 ; SSE-32-NEXT: movaps 8(%ebp), %xmm3
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-32-NEXT: movaps %xmm2, %xmm5
-; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
 ; SSE-32-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbel %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm5
-; SSE-32-NEXT: movaps %xmm3, %xmm4
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movaps %xmm3, %xmm5
+; SSE-32-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-32-NEXT: movaps %xmm2, %xmm6
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
-; SSE-32-NEXT: ucomiss %xmm4, %xmm6
+; SSE-32-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSE-32-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbel %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm5
 ; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-32-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-32-NEXT: movl $0, %edx
@@ -1511,10 +1481,8 @@
 ;
 ; SSE-64-LABEL: test_v4f32_ule_q:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movaps %xmm3, %xmm4
-; SSE-64-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
-; SSE-64-NEXT: movaps %xmm2, %xmm5
-; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm2[3,3]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
 ; SSE-64-NEXT: ucomiss %xmm4, %xmm5
 ; SSE-64-NEXT: movl $-1, %ecx
@@ -1522,28 +1490,28 @@
 ; SSE-64-NEXT: cmovbel %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm4
 ; SSE-64-NEXT: movaps %xmm3, %xmm5
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm3[3,3]
 ; SSE-64-NEXT: movaps %xmm2, %xmm6
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm2[1]
+; SSE-64-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
 ; SSE-64-NEXT: ucomiss %xmm5, %xmm6
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbel %ecx, %edx
 ; SSE-64-NEXT: movd %edx, %xmm5
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovbel %ecx, %edx
-; SSE-64-NEXT: movd %edx, %xmm4
+; SSE-64-NEXT: movd %edx, %xmm5
 ; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,1,1]
 ; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1]
 ; SSE-64-NEXT: ucomiss %xmm3, %xmm2
 ; SSE-64-NEXT: cmovbel %ecx, %eax
 ; SSE-64-NEXT: movd %eax, %xmm2
-; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm5, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm5
+; SSE-64-NEXT: por %xmm5, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v4f32_ule_q:
@@ -1904,45 +1872,45 @@
 ; SSE-32-NEXT: movl %esp, %ebp
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
-; SSE-32-NEXT: movapd 8(%ebp), %xmm4
+; SSE-32-NEXT: movaps 8(%ebp), %xmm3
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
-; SSE-32-NEXT: ucomisd %xmm4, %xmm2
+; SSE-32-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmoval %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm3
-; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT: ucomisd %xmm4, %xmm2
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT: ucomisd %xmm3, %xmm2
 ; SSE-32-NEXT: cmoval %ecx, %eax
 ; SSE-32-NEXT: movd %eax, %xmm2
 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-32-NEXT: pand %xmm3, %xmm0
-; SSE-32-NEXT: pandn %xmm1, %xmm3
-; SSE-32-NEXT: por %xmm3, %xmm0
+; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-32-NEXT: pand %xmm2, %xmm0
+; SSE-32-NEXT: pandn %xmm1, %xmm2
+; SSE-32-NEXT: por %xmm2, %xmm0
 ; SSE-32-NEXT: movl %ebp, %esp
 ; SSE-32-NEXT: popl %ebp
 ; SSE-32-NEXT: retl
 ;
 ; SSE-64-LABEL: test_v2f64_ogt_q:
 ; SSE-64: # %bb.0:
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
-; SSE-64-NEXT: ucomisd %xmm3, %xmm2
+; SSE-64-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-64-NEXT: movq $-1, %rcx
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovaq %rcx, %rdx
 ; SSE-64-NEXT: movq %rdx, %xmm4
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-64-NEXT: ucomisd %xmm3, %xmm2
 ; SSE-64-NEXT: cmovaq %rcx, %rax
 ; SSE-64-NEXT: movq %rax, %xmm2
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm2, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm2
+; SSE-64-NEXT: por %xmm2, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v2f64_ogt_q:
@@ -2024,45 +1992,45 @@
 ; SSE-32-NEXT: movl %esp, %ebp
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
-; SSE-32-NEXT: movapd 8(%ebp), %xmm4
+; SSE-32-NEXT: movaps 8(%ebp), %xmm3
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
-; SSE-32-NEXT: ucomisd %xmm4, %xmm2
+; SSE-32-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovael %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm3
-; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT: ucomisd %xmm4, %xmm2
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT: ucomisd %xmm3, %xmm2
 ; SSE-32-NEXT: cmovael %ecx, %eax
 ; SSE-32-NEXT: movd %eax, %xmm2
 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-32-NEXT: pand %xmm3, %xmm0
-; SSE-32-NEXT: pandn %xmm1, %xmm3
-; SSE-32-NEXT: por %xmm3, %xmm0
+; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-32-NEXT: pand %xmm2, %xmm0
+; SSE-32-NEXT: pandn %xmm1, %xmm2
+; SSE-32-NEXT: por %xmm2, %xmm0
 ; SSE-32-NEXT: movl %ebp, %esp
 ; SSE-32-NEXT: popl %ebp
 ; SSE-32-NEXT: retl
 ;
 ; SSE-64-LABEL: test_v2f64_oge_q:
 ; SSE-64: # %bb.0:
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
-; SSE-64-NEXT: ucomisd %xmm3, %xmm2
+; SSE-64-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-64-NEXT: movq $-1, %rcx
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovaeq %rcx, %rdx
 ; SSE-64-NEXT: movq %rdx, %xmm4
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE-64-NEXT: ucomisd %xmm3, %xmm2
 ; SSE-64-NEXT: cmovaeq %rcx, %rax
 ; SSE-64-NEXT: movq %rax, %xmm2
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm2, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm2
+; SSE-64-NEXT: por %xmm2, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v2f64_oge_q:
@@ -2144,45 +2112,45 @@
 ; SSE-32-NEXT: movl %esp, %ebp
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
-; SSE-32-NEXT: movapd 8(%ebp), %xmm4
+; SSE-32-NEXT: movaps 8(%ebp), %xmm3
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
-; SSE-32-NEXT: ucomisd %xmm2, %xmm4
+; SSE-32-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmoval %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm3
-; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-32-NEXT: ucomisd %xmm2, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT: ucomisd %xmm2, %xmm3
 ; SSE-32-NEXT: cmoval %ecx, %eax
 ; SSE-32-NEXT: movd %eax, %xmm2
 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-32-NEXT: pand %xmm3, %xmm0
-; SSE-32-NEXT: pandn %xmm1, %xmm3
-; SSE-32-NEXT: por %xmm3, %xmm0
+; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-32-NEXT: pand %xmm2, %xmm0
+; SSE-32-NEXT: pandn %xmm1, %xmm2
+; SSE-32-NEXT: por %xmm2, %xmm0
 ; SSE-32-NEXT: movl %ebp, %esp
 ; SSE-32-NEXT: popl %ebp
 ; SSE-32-NEXT: retl
 ;
 ; SSE-64-LABEL: test_v2f64_olt_q:
 ; SSE-64: # %bb.0:
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
-; SSE-64-NEXT: ucomisd %xmm2, %xmm3
+; SSE-64-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-64-NEXT: movq $-1, %rcx
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovaq %rcx, %rdx
 ; SSE-64-NEXT: movq %rdx, %xmm4
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
 ; SSE-64-NEXT: ucomisd %xmm2, %xmm3
 ; SSE-64-NEXT: cmovaq %rcx, %rax
 ; SSE-64-NEXT: movq %rax, %xmm2
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm2, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm2
+; SSE-64-NEXT: por %xmm2, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v2f64_olt_q:
@@ -2262,45 +2230,45 @@
 ; SSE-32-NEXT: movl %esp, %ebp
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
-; SSE-32-NEXT: movapd 8(%ebp), %xmm4
+; SSE-32-NEXT: movaps 8(%ebp), %xmm3
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
-; SSE-32-NEXT: ucomisd %xmm2, %xmm4
+; SSE-32-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovael %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm3
-; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-32-NEXT: ucomisd %xmm2, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT: ucomisd %xmm2, %xmm3
 ; SSE-32-NEXT: cmovael %ecx, %eax
 ; SSE-32-NEXT: movd %eax, %xmm2
 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-32-NEXT: pand %xmm3, %xmm0
-; SSE-32-NEXT: pandn %xmm1, %xmm3
-; SSE-32-NEXT: por %xmm3, %xmm0
+; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-32-NEXT: pand %xmm2, %xmm0
+; SSE-32-NEXT: pandn %xmm1, %xmm2
+; SSE-32-NEXT: por %xmm2, %xmm0
 ; SSE-32-NEXT: movl %ebp, %esp
 ; SSE-32-NEXT: popl %ebp
 ; SSE-32-NEXT: retl
 ;
 ; SSE-64-LABEL: test_v2f64_ole_q:
 ; SSE-64: # %bb.0:
+; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-64-NEXT: xorl %eax, %eax
-; SSE-64-NEXT: ucomisd %xmm2, %xmm3
+; SSE-64-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-64-NEXT: movq $-1, %rcx
 ; SSE-64-NEXT: movl $0, %edx
 ; SSE-64-NEXT: cmovaeq %rcx, %rdx
 ; SSE-64-NEXT: movq %rdx, %xmm4
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
 ; SSE-64-NEXT: ucomisd %xmm2, %xmm3
 ; SSE-64-NEXT: cmovaeq %rcx, %rax
 ; SSE-64-NEXT: movq %rax, %xmm2
-; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE-64-NEXT: pand %xmm4, %xmm0
-; SSE-64-NEXT: pandn %xmm1, %xmm4
-; SSE-64-NEXT: por %xmm4, %xmm0
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-64-NEXT: pand %xmm2, %xmm0
+; SSE-64-NEXT: pandn %xmm1, %xmm2
+; SSE-64-NEXT: por %xmm2, %xmm0
 ; SSE-64-NEXT: retq
 ;
 ; AVX-32-LABEL: test_v2f64_ole_q:
@@ -2673,45 +2641,45 @@
 ; SSE-32-NEXT: movl %esp, %ebp
 ; SSE-32-NEXT: andl $-16, %esp
 ; SSE-32-NEXT: subl $16, %esp
-; SSE-32-NEXT: movapd 8(%ebp), %xmm4
+; SSE-32-NEXT: movaps 8(%ebp), %xmm3
+; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1]
+; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1]
 ; SSE-32-NEXT: xorl %eax, %eax
-; SSE-32-NEXT: ucomisd %xmm2, %xmm4
+; SSE-32-NEXT: ucomisd %xmm4, %xmm5
 ; SSE-32-NEXT: movl $-1, %ecx
 ; SSE-32-NEXT: movl $0, %edx
 ; SSE-32-NEXT: cmovbl %ecx, %edx
-; SSE-32-NEXT: movd %edx, %xmm3
-; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-32-NEXT: ucomisd %xmm2, %xmm4
+; SSE-32-NEXT: movd %edx, %xmm4
+; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1]
+; SSE-32-NEXT: ucomisd %xmm2, %xmm3
 ; SSE-32-NEXT: cmovbl %ecx, %eax
 ; SSE-32-NEXT: movd %eax, %xmm2
 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE-32-NEXT: pand %xmm3, %xmm0
-; SSE-32-NEXT: pandn %xmm1, %xmm3
-; SSE-32-NEXT: por %xmm3, %xmm0
+; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-32-NEXT: pand %xmm2, %xmm0
+; SSE-32-NEXT: pandn %xmm1, %xmm2
+; SSE-32-NEXT: por %xmm2, %xmm0
 ; SSE-32-NEXT: movl %ebp, %esp
 ; SSE-32-NEXT: popl %ebp
 ; SSE-32-NEXT: retl
 ;
 ; SSE-64-LABEL: test_v2f64_ugt_q:
 ; SSE-64: # %bb.0:
+; SSE-64-NEXT:
movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1] +; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1] ; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm2, %xmm3 +; SSE-64-NEXT: ucomisd %xmm4, %xmm5 ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbq %rcx, %rdx ; SSE-64-NEXT: movq %rdx, %xmm4 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-64-NEXT: ucomisd %xmm2, %xmm3 ; SSE-64-NEXT: cmovbq %rcx, %rax ; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_ugt_q: @@ -2791,45 +2759,45 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 +; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1] +; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1] ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: ucomisd %xmm4, %xmm5 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: ucomisd %xmm2, %xmm4 +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm3 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: test_v2f64_uge_q: ; SSE-64: # %bb.0: +; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm2[1],xmm4[1] +; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm3[1],xmm5[1] ; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm2, %xmm3 +; SSE-64-NEXT: ucomisd %xmm4, %xmm5 ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbeq %rcx, %rdx ; SSE-64-NEXT: movq %rdx, %xmm4 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] ; SSE-64-NEXT: ucomisd %xmm2, %xmm3 ; SSE-64-NEXT: cmovbeq %rcx, %rax ; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_uge_q: @@ -2909,45 +2877,45 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 +; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1] +; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1] 
; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm5 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbl %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmovbl %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: test_v2f64_ult_q: ; SSE-64: # %bb.0: +; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1] +; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1] ; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm3, %xmm2 +; SSE-64-NEXT: ucomisd %xmm4, %xmm5 ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbq %rcx, %rdx ; SSE-64-NEXT: movq %rdx, %xmm4 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] ; SSE-64-NEXT: ucomisd %xmm3, %xmm2 ; SSE-64-NEXT: cmovbq %rcx, %rax ; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_ult_q: @@ -3029,45 +2997,45 @@ ; SSE-32-NEXT: movl %esp, %ebp ; SSE-32-NEXT: andl $-16, %esp ; SSE-32-NEXT: subl $16, %esp -; SSE-32-NEXT: movapd 8(%ebp), %xmm4 +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 +; SSE-32-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1] +; SSE-32-NEXT: movhlps {{.*#+}} xmm5 = xmm2[1],xmm5[1] ; SSE-32-NEXT: xorl %eax, %eax -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: ucomisd %xmm4, %xmm5 ; SSE-32-NEXT: movl $-1, %ecx ; SSE-32-NEXT: movl $0, %edx ; SSE-32-NEXT: cmovbel %ecx, %edx -; SSE-32-NEXT: movd %edx, %xmm3 -; SSE-32-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-32-NEXT: ucomisd %xmm4, %xmm2 +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,1,1] +; SSE-32-NEXT: ucomisd %xmm3, %xmm2 ; SSE-32-NEXT: cmovbel %ecx, %eax ; SSE-32-NEXT: movd %eax, %xmm2 ; SSE-32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; SSE-32-NEXT: pand %xmm3, %xmm0 -; SSE-32-NEXT: pandn %xmm1, %xmm3 -; SSE-32-NEXT: por %xmm3, %xmm0 +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-32-NEXT: pand %xmm2, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm2 +; SSE-32-NEXT: por %xmm2, %xmm0 ; SSE-32-NEXT: movl %ebp, %esp ; SSE-32-NEXT: popl %ebp ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: test_v2f64_ule_q: ; SSE-64: # %bb.0: +; SSE-64-NEXT: movhlps {{.*#+}} xmm4 = xmm3[1],xmm4[1] +; SSE-64-NEXT: movhlps {{.*#+}} xmm5 = 
xmm2[1],xmm5[1] ; SSE-64-NEXT: xorl %eax, %eax -; SSE-64-NEXT: ucomisd %xmm3, %xmm2 +; SSE-64-NEXT: ucomisd %xmm4, %xmm5 ; SSE-64-NEXT: movq $-1, %rcx ; SSE-64-NEXT: movl $0, %edx ; SSE-64-NEXT: cmovbeq %rcx, %rdx ; SSE-64-NEXT: movq %rdx, %xmm4 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] ; SSE-64-NEXT: ucomisd %xmm3, %xmm2 ; SSE-64-NEXT: cmovbeq %rcx, %rax ; SSE-64-NEXT: movq %rax, %xmm2 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; SSE-64-NEXT: pand %xmm4, %xmm0 -; SSE-64-NEXT: pandn %xmm1, %xmm4 -; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] +; SSE-64-NEXT: pand %xmm2, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm2 +; SSE-64-NEXT: por %xmm2, %xmm0 ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: test_v2f64_ule_q: diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -81,13 +81,12 @@ ; ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: ; SSE-64: # %bb.0: -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-64-NEXT: cvttsd2si %xmm1, %rax ; SSE-64-NEXT: movq %rax, %xmm1 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-64-NEXT: cvttsd2si %xmm0, %rax ; SSE-64-NEXT: movq %rax, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptosi_v2f64_to_v2i64: @@ -280,37 +279,37 @@ ; ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: ; SSE-64: # %bb.0: +; SSE-64-NEXT: movapd %xmm0, %xmm1 ; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-64-NEXT: comisd %xmm3, %xmm0 ; SSE-64-NEXT: xorpd %xmm2, %xmm2 -; SSE-64-NEXT: xorpd %xmm1, %xmm1 +; SSE-64-NEXT: xorpd %xmm0, %xmm0 ; SSE-64-NEXT: jb .LBB1_2 ; SSE-64-NEXT: # %bb.1: -; SSE-64-NEXT: movapd %xmm3, %xmm1 +; SSE-64-NEXT: movapd %xmm3, %xmm0 ; SSE-64-NEXT: .LBB1_2: -; SSE-64-NEXT: movapd %xmm0, %xmm4 -; SSE-64-NEXT: subsd %xmm1, %xmm4 +; SSE-64-NEXT: movapd %xmm1, %xmm4 +; SSE-64-NEXT: subsd %xmm0, %xmm4 ; SSE-64-NEXT: cvttsd2si %xmm4, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx ; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx -; SSE-64-NEXT: movq %rcx, %xmm1 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm0 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-64-NEXT: comisd %xmm3, %xmm1 ; SSE-64-NEXT: jb .LBB1_4 ; SSE-64-NEXT: # %bb.3: ; SSE-64-NEXT: movapd %xmm3, %xmm2 ; SSE-64-NEXT: .LBB1_4: -; SSE-64-NEXT: subsd %xmm2, %xmm0 -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: subsd %xmm2, %xmm1 +; SSE-64-NEXT: cvttsd2si %xmm1, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx ; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx -; SSE-64-NEXT: movq %rcx, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptoui_v2f64_to_v2i64: @@ -1561,48 +1560,47 @@ define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 { ; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32: ; SSE-32: # %bb.0: +; SSE-32-NEXT: movapd %xmm0, %xmm1 ; SSE-32-NEXT: movsd 
{{.*#+}} xmm3 = mem[0],zero ; SSE-32-NEXT: comisd %xmm3, %xmm0 ; SSE-32-NEXT: xorpd %xmm2, %xmm2 -; SSE-32-NEXT: xorpd %xmm1, %xmm1 +; SSE-32-NEXT: xorpd %xmm0, %xmm0 ; SSE-32-NEXT: jb .LBB7_2 ; SSE-32-NEXT: # %bb.1: -; SSE-32-NEXT: movapd %xmm3, %xmm1 +; SSE-32-NEXT: movapd %xmm3, %xmm0 ; SSE-32-NEXT: .LBB7_2: ; SSE-32-NEXT: setae %al ; SSE-32-NEXT: movzbl %al, %eax ; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: movapd %xmm0, %xmm4 -; SSE-32-NEXT: subsd %xmm1, %xmm4 +; SSE-32-NEXT: movapd %xmm1, %xmm4 +; SSE-32-NEXT: subsd %xmm0, %xmm4 ; SSE-32-NEXT: cvttsd2si %xmm4, %ecx ; SSE-32-NEXT: xorl %eax, %ecx -; SSE-32-NEXT: movd %ecx, %xmm1 -; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-32-NEXT: comisd %xmm3, %xmm0 +; SSE-32-NEXT: movd %ecx, %xmm0 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-32-NEXT: comisd %xmm3, %xmm1 ; SSE-32-NEXT: jb .LBB7_4 ; SSE-32-NEXT: # %bb.3: ; SSE-32-NEXT: movapd %xmm3, %xmm2 ; SSE-32-NEXT: .LBB7_4: -; SSE-32-NEXT: setae %al -; SSE-32-NEXT: movzbl %al, %eax -; SSE-32-NEXT: shll $31, %eax -; SSE-32-NEXT: subsd %xmm2, %xmm0 -; SSE-32-NEXT: cvttsd2si %xmm0, %ecx +; SSE-32-NEXT: subsd %xmm2, %xmm1 +; SSE-32-NEXT: cvttsd2si %xmm1, %eax +; SSE-32-NEXT: setae %cl +; SSE-32-NEXT: movzbl %cl, %ecx +; SSE-32-NEXT: shll $31, %ecx ; SSE-32-NEXT: xorl %eax, %ecx -; SSE-32-NEXT: movd %ecx, %xmm0 -; SSE-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-32-NEXT: movdqa %xmm1, %xmm0 +; SSE-32-NEXT: movd %ecx, %xmm1 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i32: ; SSE-64: # %bb.0: -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-64-NEXT: cvttsd2si %xmm1, %rax ; SSE-64-NEXT: movd %eax, %xmm1 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-64-NEXT: cvttsd2si %xmm0, %rax ; SSE-64-NEXT: movd %eax, %xmm0 -; SSE-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32: @@ -2289,13 +2287,12 @@ ; ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i1: ; SSE-64: # %bb.0: -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-64-NEXT: cvttsd2si %xmm1, %rax ; SSE-64-NEXT: movq %rax, %xmm1 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-64-NEXT: cvttsd2si %xmm0, %rax ; SSE-64-NEXT: movq %rax, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptosi_v2f64_to_v2i1: @@ -2442,37 +2439,37 @@ ; ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1: ; SSE-64: # %bb.0: +; SSE-64-NEXT: movapd %xmm0, %xmm1 ; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-64-NEXT: comisd %xmm3, %xmm0 ; SSE-64-NEXT: xorpd %xmm2, %xmm2 -; SSE-64-NEXT: xorpd %xmm1, %xmm1 +; SSE-64-NEXT: xorpd %xmm0, %xmm0 ; SSE-64-NEXT: jb .LBB19_2 ; SSE-64-NEXT: # %bb.1: -; SSE-64-NEXT: movapd %xmm3, %xmm1 +; SSE-64-NEXT: movapd %xmm3, %xmm0 ; SSE-64-NEXT: .LBB19_2: -; SSE-64-NEXT: movapd %xmm0, %xmm4 -; SSE-64-NEXT: subsd %xmm1, %xmm4 +; SSE-64-NEXT: movapd %xmm1, %xmm4 +; SSE-64-NEXT: subsd %xmm0, %xmm4 ; SSE-64-NEXT: cvttsd2si %xmm4, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx ; SSE-64-NEXT: shlq $63, %rcx ; 
SSE-64-NEXT: xorq %rax, %rcx -; SSE-64-NEXT: movq %rcx, %xmm1 -; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-64-NEXT: comisd %xmm3, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm0 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-64-NEXT: comisd %xmm3, %xmm1 ; SSE-64-NEXT: jb .LBB19_4 ; SSE-64-NEXT: # %bb.3: ; SSE-64-NEXT: movapd %xmm3, %xmm2 ; SSE-64-NEXT: .LBB19_4: -; SSE-64-NEXT: subsd %xmm2, %xmm0 -; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: subsd %xmm2, %xmm1 +; SSE-64-NEXT: cvttsd2si %xmm1, %rax ; SSE-64-NEXT: setae %cl ; SSE-64-NEXT: movzbl %cl, %ecx ; SSE-64-NEXT: shlq $63, %rcx ; SSE-64-NEXT: xorq %rax, %rcx -; SSE-64-NEXT: movq %rcx, %xmm0 -; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-64-NEXT: retq ; ; AVX-32-LABEL: strict_vector_fptoui_v2f64_to_v2i1: diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -19,13 +19,12 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i64: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: cvttsd2si %xmm1, %rax ; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: fptosi_2f64_to_2i64: @@ -123,20 +122,19 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) { ; SSE-LABEL: fptosi_4f64_to_4i64: ; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] +; SSE-NEXT: cvttsd2si %xmm2, %rax ; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: cvttsd2si %xmm0, %rax ; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] +; SSE-NEXT: cvttsd2si %xmm2, %rax +; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movq %rax, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: fptosi_4f64_to_4i64: @@ -250,17 +248,16 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i64: ; SSE: # %bb.0: +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: cvttsd2si %xmm1, %rax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: sarq $63, %rcx ; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 ; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: movq %rcx, %rdx -; SSE-NEXT: sarq $63, %rdx -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx +; SSE-NEXT: cvttsd2si %xmm1, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: 
cvttsd2si %xmm0, %rax ; SSE-NEXT: subsd %xmm2, %xmm0 ; SSE-NEXT: cvttsd2si %xmm0, %rcx @@ -269,8 +266,7 @@ ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; VEX-LABEL: fptoui_2f64_to_2i64: @@ -510,46 +506,44 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { ; SSE-LABEL: fptoui_4f64_to_4i64: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: subsd %xmm3, %xmm0 +; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] +; SSE-NEXT: cvttsd2si %xmm3, %rax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: subsd %xmm2, %xmm3 +; SSE-NEXT: cvttsd2si %xmm3, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: cvttsd2si %xmm2, %rcx -; SSE-NEXT: movq %rcx, %rdx -; SSE-NEXT: sarq $63, %rdx -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rcx +; SSE-NEXT: subsd %xmm2, %xmm0 +; SSE-NEXT: cvttsd2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rax -; SSE-NEXT: cvttsd2si %xmm1, %rcx -; SSE-NEXT: movq %rcx, %rdx -; SSE-NEXT: sarq $63, %rdx -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: xorps %xmm3, %xmm3 +; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] +; SSE-NEXT: cvttsd2si %xmm3, %rax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: subsd %xmm2, %xmm3 +; SSE-NEXT: cvttsd2si %xmm3, %rdx +; SSE-NEXT: andq %rcx, %rdx +; SSE-NEXT: orq %rax, %rdx +; SSE-NEXT: movq %rdx, %xmm3 ; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: subsd %xmm3, %xmm1 +; SSE-NEXT: subsd %xmm2, %xmm1 ; SSE-NEXT: cvttsd2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx ; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: fptoui_4f64_to_4i64: @@ -941,14 +935,14 @@ ; SSE-NEXT: cvttss2si %xmm1, %rax ; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa 
%xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -1049,14 +1043,14 @@ ; SSE-NEXT: cvttss2si %xmm1, %rax ; SSE-NEXT: movq %rax, %xmm1 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttss2si %xmm0, %rax ; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: cvttss2si %xmm0, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -1520,47 +1514,47 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { ; SSE-LABEL: fptoui_4f32_to_4i64: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subss %xmm1, %xmm2 -; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: subss %xmm3, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rax ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: subss %xmm1, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: subss %xmm3, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: subss %xmm1, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rcx -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: subss %xmm3, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: subss %xmm3, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -1704,47 +1698,47 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { ; SSE-LABEL: fptoui_8f32_to_4i64: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subss %xmm1, %xmm2 -; SSE-NEXT: cvttss2si %xmm2, %rax +; SSE-NEXT: 
movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: subss %xmm3, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rax ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rcx, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rax, %rdx ; SSE-NEXT: orq %rcx, %rdx ; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: subss %xmm1, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rcx +; SSE-NEXT: movaps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: subss %xmm3, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm0[3,3] -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: subss %xmm1, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rcx -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: sarq $63, %rdx +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: cvttss2si %xmm1, %rax +; SSE-NEXT: movq %rax, %rcx +; SSE-NEXT: sarq $63, %rcx +; SSE-NEXT: subss %xmm3, %xmm1 +; SSE-NEXT: cvttss2si %xmm1, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE-NEXT: movq %rdx, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: subss %xmm1, %xmm0 +; SSE-NEXT: subss %xmm3, %xmm0 ; SSE-NEXT: cvttss2si %xmm0, %rcx ; SSE-NEXT: movq %rax, %rdx ; SSE-NEXT: sarq $63, %rdx ; SSE-NEXT: andq %rcx, %rdx ; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-NEXT: movq %rdx, %xmm0 +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll --- a/llvm/test/CodeGen/X86/vec_fpext.ll +++ b/llvm/test/CodeGen/X86/vec_fpext.ll @@ -52,11 +52,10 @@ define <4 x double> @fpext_8f32_to_4f64(<8 x float> %a) { ; SSE-LABEL: fpext_8f32_to_4f64: ; SSE: # %bb.0: -; SSE-NEXT: cvtps2pd %xmm0, %xmm2 # encoding: [0x0f,0x5a,0xd0] -; SSE-NEXT: movhlps %xmm0, %xmm0 # encoding: [0x0f,0x12,0xc0] -; SSE-NEXT: # xmm0 = xmm0[1,1] -; SSE-NEXT: cvtps2pd %xmm0, %xmm1 # encoding: [0x0f,0x5a,0xc8] -; SSE-NEXT: movaps %xmm2, %xmm0 # encoding: [0x0f,0x28,0xc2] +; SSE-NEXT: movhlps %xmm0, %xmm1 # encoding: [0x0f,0x12,0xc8] +; SSE-NEXT: # xmm1 = xmm0[1],xmm1[1] +; SSE-NEXT: cvtps2pd %xmm1, %xmm1 # encoding: [0x0f,0x5a,0xc9] +; SSE-NEXT: cvtps2pd %xmm0, %xmm0 # encoding: [0x0f,0x5a,0xc0] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX-LABEL: fpext_8f32_to_4f64: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -89,8 +89,8 @@ ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = 
xmm1[1],xmm0[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: wait ; CHECK-NEXT: retq @@ -483,8 +483,8 @@ ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: wait ; CHECK-NEXT: retq @@ -621,8 +621,8 @@ ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: wait ; CHECK-NEXT: retq @@ -761,8 +761,8 @@ ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] ; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: wait ; CHECK-NEXT: retq @@ -894,8 +894,8 @@ ; CHECK-NEXT: sqrtsd %xmm0, %xmm1 ; CHECK-NEXT: sqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movapd %xmm0, %xmm1 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) ; CHECK-NEXT: wait ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-intrinsics.ll b/llvm/test/CodeGen/X86/vector-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-intrinsics.ll @@ -12,23 +12,24 @@ ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq sin@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq sin@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq sin@PLT -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: 
unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -43,23 +44,24 @@ ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq cos@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq cos@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq cos@PLT -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -77,28 +79,29 @@ ; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq pow@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; 
CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq pow@PLT -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: addq $88, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -120,8 +123,8 @@ ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: movl %ebx, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -131,8 +134,8 @@ ; CHECK-NEXT: movl %ebx, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: movl %ebx, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload @@ -164,51 +167,51 @@ ; CHECK-NEXT: movaps (%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 48(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: callq exp@PLT -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq exp@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq exp@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq exp@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq exp@PLT +; CHECK-NEXT: unpcklpd (%rsp), 
%xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: callq exp@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq exp@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq exp@PLT -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero ; CHECK-NEXT: callq exp@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 16(%rbx) @@ -235,11 +238,11 @@ ; CHECK-NEXT: movaps (%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rdi), %xmm0 -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rdi), %xmm2 -; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rdi), %xmm1 +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movaps (%rsi), %xmm0 @@ -247,63 +250,64 @@ ; CHECK-NEXT: movaps 16(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rsi), %xmm0 -; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rsi), %xmm1 -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps %xmm2, %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rsi), %xmm2 +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: xorps %xmm0, %xmm0 +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq pow@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; CHECK-NEXT: callq pow@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq pow@PLT +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3] ; CHECK-NEXT: callq pow@PLT -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq pow@PLT -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = mem[0,1],xmm1[2,3] +; CHECK-NEXT: callq pow@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 
# 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq pow@PLT -; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload ; CHECK-NEXT: # xmm0 = mem[0],zero ; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload ; CHECK-NEXT: # xmm1 = mem[0],zero ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, 64(%rbx) -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, (%rbx) ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps %xmm0, 16(%rbx) @@ -341,8 +345,8 @@ ; CHECK-NEXT: movl %esi, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -352,8 +356,8 @@ ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -363,8 +367,8 @@ ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -374,8 +378,8 @@ ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movlps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: movl %ebp, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -154,8 +154,8 @@ ; SSE-NEXT: mulpd %xmm2, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm1 ; SSE-NEXT: addpd %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: movapd %xmm2, 
%xmm0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1]
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: fmul_v2f64:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd-fast.ll
@@ -60,8 +60,7 @@
 define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; SSE2-LABEL: test_v4f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addps %xmm1, %xmm2
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
@@ -71,8 +70,7 @@
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addps %xmm1, %xmm2
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm1
@@ -120,8 +118,8 @@
 ; SSE2-LABEL: test_v8f32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: addps %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addps %xmm1, %xmm2
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
@@ -132,8 +130,8 @@
 ; SSE41-LABEL: test_v8f32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: addps %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addps %xmm1, %xmm2
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm1
@@ -195,8 +193,8 @@
 ; SSE2-NEXT: addps %xmm4, %xmm2
 ; SSE2-NEXT: addps %xmm3, %xmm1
 ; SSE2-NEXT: addps %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addps %xmm1, %xmm2
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
@@ -209,8 +207,8 @@
 ; SSE41-NEXT: addps %xmm4, %xmm2
 ; SSE41-NEXT: addps %xmm3, %xmm1
 ; SSE41-NEXT: addps %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addps %xmm1, %xmm2
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm1
@@ -319,8 +317,7 @@
 define float @test_v4f32_zero(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32_zero:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: addps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -329,8 +326,7 @@
 ;
 ; SSE41-LABEL: test_v4f32_zero:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: addps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -373,8 +369,8 @@
 ; SSE2-LABEL: test_v8f32_zero:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: addps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: addps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -384,8 +380,8 @@
 ; SSE41-LABEL: test_v8f32_zero:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: addps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -442,8 +438,8 @@
 ; SSE2-NEXT: addps %xmm3, %xmm1
 ; SSE2-NEXT: addps %xmm2, %xmm0
 ; SSE2-NEXT: addps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: addps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -455,8 +451,8 @@
 ; SSE41-NEXT: addps %xmm3, %xmm1
 ; SSE41-NEXT: addps %xmm2, %xmm0
 ; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: addps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -560,8 +556,7 @@
 define float @test_v4f32_undef(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32_undef:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: addps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -570,8 +565,7 @@
 ;
 ; SSE41-LABEL: test_v4f32_undef:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: addps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -614,8 +608,8 @@
 ; SSE2-LABEL: test_v8f32_undef:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: addps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: addps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -625,8 +619,8 @@
 ; SSE41-LABEL: test_v8f32_undef:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: addps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -683,8 +677,8 @@
 ; SSE2-NEXT: addps %xmm3, %xmm1
 ; SSE2-NEXT: addps %xmm2, %xmm0
 ; SSE2-NEXT: addps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: addps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -696,8 +690,8 @@
 ; SSE41-NEXT: addps %xmm3, %xmm1
 ; SSE41-NEXT: addps %xmm2, %xmm0
 ; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: addps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -761,8 +755,7 @@
 define double @test_v2f64(double %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_v2f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE-NEXT: addsd %xmm1, %xmm2
 ; SSE-NEXT: addsd %xmm2, %xmm0
 ; SSE-NEXT: retq
@@ -801,8 +794,8 @@
 ; SSE-LABEL: test_v4f64:
 ; SSE: # %bb.0:
 ; SSE-NEXT: addpd %xmm2, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE-NEXT: addsd %xmm1, %xmm2
 ; SSE-NEXT: addsd %xmm2, %xmm0
 ; SSE-NEXT: retq
@@ -855,8 +848,8 @@
 ; SSE-NEXT: addpd %xmm4, %xmm2
 ; SSE-NEXT: addpd %xmm3, %xmm1
 ; SSE-NEXT: addpd %xmm2, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE-NEXT: addsd %xmm1, %xmm2
 ; SSE-NEXT: addsd %xmm2, %xmm0
 ; SSE-NEXT: retq
@@ -914,12 +907,12 @@
 ; SSE-NEXT: addpd %xmm6, %xmm2
 ; SSE-NEXT: addpd %xmm7, %xmm3
 ; SSE-NEXT: addpd %xmm5, %xmm1
-; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: addpd %xmm3, %xmm1
+; SSE-NEXT: addpd {{[0-9]+}}(%rsp), %xmm4
 ; SSE-NEXT: addpd %xmm2, %xmm4
 ; SSE-NEXT: addpd %xmm1, %xmm4
-; SSE-NEXT: movapd %xmm4, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm4, %xmm1
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
@@ -985,8 +978,7 @@
 define double @test_v2f64_zero(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1020,8 +1012,8 @@
 ; SSE-LABEL: test_v4f64_zero:
 ; SSE: # %bb.0:
 ; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1069,8 +1061,8 @@
 ; SSE-NEXT: addpd %xmm3, %xmm1
 ; SSE-NEXT: addpd %xmm2, %xmm0
 ; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1127,8 +1119,8 @@
 ; SSE-NEXT: addpd %xmm5, %xmm1
 ; SSE-NEXT: addpd %xmm3, %xmm1
 ; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1189,8 +1181,7 @@
 define double @test_v2f64_undef(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64_undef:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1224,8 +1215,8 @@
 ; SSE-LABEL: test_v4f64_undef:
 ; SSE: # %bb.0:
 ; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1273,8 +1264,8 @@
 ; SSE-NEXT: addpd %xmm3, %xmm1
 ; SSE-NEXT: addpd %xmm2, %xmm0
 ; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1331,8 +1322,8 @@
 ; SSE-NEXT: addpd %xmm5, %xmm1
 ; SSE-NEXT: addpd %xmm3, %xmm1
 ; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
@@ -50,8 +50,8 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
 ; SSE2-NEXT: addss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm2, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: addss %xmm1, %xmm0
@@ -62,8 +62,8 @@
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -101,8 +101,8 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
 ; SSE2-NEXT: addss %xmm3, %xmm0
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
 ; SSE2-NEXT: addss %xmm3, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: addss %xmm1, %xmm0
@@ -110,8 +110,8 @@
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE2-NEXT: addss %xmm2, %xmm0
@@ -122,16 +122,16 @@
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm3, %xmm3
+; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
 ; SSE41-NEXT: addss %xmm3, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
@@ -187,8 +187,8 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm5
 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1]
 ; SSE2-NEXT: addss %xmm5, %xmm0
-; SSE2-NEXT: movaps %xmm1, %xmm5
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm5, %xmm5
+; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
 ; SSE2-NEXT: addss %xmm5, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: addss %xmm1, %xmm0
@@ -196,8 +196,8 @@
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE2-NEXT: addss %xmm2, %xmm0
@@ -205,8 +205,8 @@
 ; SSE2-NEXT: movaps %xmm3, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE2-NEXT: addss %xmm3, %xmm0
@@ -214,8 +214,8 @@
 ; SSE2-NEXT: movaps %xmm4, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm4, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
 ; SSE2-NEXT: addss %xmm4, %xmm0
@@ -226,32 +226,32 @@
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm5, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm5
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm5, %xmm5
+; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
 ; SSE41-NEXT: addss %xmm5, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: addss %xmm3, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE41-NEXT: addss %xmm3, %xmm0
 ; SSE41-NEXT: addss %xmm4, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm4, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3]
 ; SSE41-NEXT: addss %xmm4, %xmm0
@@ -381,8 +381,7 @@
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm1, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT: addss %xmm2, %xmm0
@@ -392,8 +391,7 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm1, %xmm2
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
@@ -447,8 +445,7 @@
 ; SSE2-NEXT: movaps %xmm0, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss %xmm0, %xmm2
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
 ; SSE2-NEXT: addss %xmm2, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT: addss %xmm3, %xmm0
@@ -456,8 +453,8 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
 ; SSE2-NEXT: addss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm2, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: addss %xmm1, %xmm0
@@ -467,16 +464,15 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm0, %xmm2
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
 ; SSE41-NEXT: addss %xmm2, %xmm3
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE41-NEXT: addss %xmm3, %xmm0
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -566,8 +562,7 @@
 ; SSE2-NEXT: movaps %xmm0, %xmm4
 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss %xmm0, %xmm4
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
 ; SSE2-NEXT: addss %xmm4, %xmm5
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT: addss %xmm5, %xmm0
@@ -575,8 +570,8 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm4
 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
 ; SSE2-NEXT: addss %xmm4, %xmm0
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
 ; SSE2-NEXT: addss %xmm4, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: addss %xmm1, %xmm0
@@ -584,8 +579,8 @@
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE2-NEXT: addss %xmm2, %xmm0
@@ -593,8 +588,8 @@
 ; SSE2-NEXT: movaps %xmm3, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE2-NEXT: addss %xmm3, %xmm0
@@ -604,32 +599,31 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss %xmm0, %xmm4
-; SSE41-NEXT: movaps %xmm0, %xmm5
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
 ; SSE41-NEXT: addss %xmm4, %xmm5
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE41-NEXT: addss %xmm5, %xmm0
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm4, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm4
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm4, %xmm4
+; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
 ; SSE41-NEXT: addss %xmm4, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: addss %xmm3, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE41-NEXT: addss %xmm3, %xmm0
@@ -812,8 +806,7 @@
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm1, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT: addss %xmm2, %xmm0
@@ -823,8 +816,7 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: movaps %xmm0, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm1, %xmm2
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
@@ -859,8 +851,7 @@
 ; SSE2-NEXT: movaps %xmm0, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movaps %xmm0, %xmm3
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
 ; SSE2-NEXT: addss %xmm2, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT: addss %xmm3, %xmm0
@@ -868,8 +859,8 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm2
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
 ; SSE2-NEXT: addss %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: addss %xmm2, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: addss %xmm1, %xmm0
@@ -879,16 +870,15 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE41-NEXT: movaps %xmm0, %xmm3
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
 ; SSE41-NEXT: addss %xmm2, %xmm3
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE41-NEXT: addss %xmm3, %xmm0
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
@@ -941,8 +931,7 @@
 ; SSE2-NEXT: movaps %xmm0, %xmm4
 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1]
 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
 ; SSE2-NEXT: addss %xmm4, %xmm5
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE2-NEXT: addss %xmm5, %xmm0
@@ -950,8 +939,8 @@
 ; SSE2-NEXT: movaps %xmm1, %xmm4
 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
 ; SSE2-NEXT: addss %xmm4, %xmm0
-; SSE2-NEXT: movaps %xmm1, %xmm4
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
 ; SSE2-NEXT: addss %xmm4, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE2-NEXT: addss %xmm1, %xmm0
@@ -959,8 +948,8 @@
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm2, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE2-NEXT: addss %xmm2, %xmm0
@@ -968,8 +957,8 @@
 ; SSE2-NEXT: movaps %xmm3, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
 ; SSE2-NEXT: addss %xmm1, %xmm0
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE2-NEXT: addss %xmm3, %xmm0
@@ -979,32 +968,31 @@
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
 ; SSE41-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE41-NEXT: movaps %xmm0, %xmm5
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
 ; SSE41-NEXT: addss %xmm4, %xmm5
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
 ; SSE41-NEXT: addss %xmm5, %xmm0
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; SSE41-NEXT: addss %xmm4, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm4
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm4, %xmm4
+; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
 ; SSE41-NEXT: addss %xmm4, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
 ; SSE41-NEXT: addss %xmm2, %xmm0
 ; SSE41-NEXT: addss %xmm3, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
 ; SSE41-NEXT: addss %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
 ; SSE41-NEXT: addss %xmm1, %xmm0
 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
 ; SSE41-NEXT: addss %xmm3, %xmm0
@@ -1118,8 +1106,9 @@
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: addsd %xmm2, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v4f64:
@@ -1156,14 +1145,17 @@
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: addsd %xmm3, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm3, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm4, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-NEXT: addsd %xmm4, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v8f64:
@@ -1216,26 +1208,33 @@
 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: addsd %xmm2, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE2-NEXT: addsd %xmm2, %xmm0
-; SSE2-NEXT: addsd %xmm3, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: addsd %xmm3, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: addsd %xmm4, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE2-NEXT: addsd %xmm4, %xmm0
-; SSE2-NEXT: addsd %xmm5, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: addsd %xmm5, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1]
+; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: addsd %xmm6, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
-; SSE2-NEXT: addsd %xmm6, %xmm0
-; SSE2-NEXT: addsd %xmm7, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1]
+; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: addsd %xmm7, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1]
+; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: addsd %xmm8, %xmm0
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
-; SSE2-NEXT: addsd %xmm8, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm8[1],xmm1[1]
+; SSE2-NEXT: addsd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test_v16f64:
@@ -1244,23 +1243,29 @@
 ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE41-NEXT: addsd %xmm1, %xmm0
 ; SSE41-NEXT: addsd %xmm2, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE41-NEXT: addsd %xmm2, %xmm0
-; SSE41-NEXT: addsd %xmm3, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE41-NEXT: addsd %xmm1, %xmm0
 ; SSE41-NEXT: addsd %xmm3, %xmm0
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE41-NEXT: addsd %xmm1, %xmm0
 ; SSE41-NEXT: addsd %xmm4, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE41-NEXT: addsd %xmm4, %xmm0
-; SSE41-NEXT: addsd %xmm5, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; SSE41-NEXT: addsd %xmm1, %xmm0
 ; SSE41-NEXT: addsd %xmm5, %xmm0
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1]
+; SSE41-NEXT: addsd %xmm1, %xmm0
 ; SSE41-NEXT: addsd %xmm6, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
-; SSE41-NEXT: addsd %xmm6, %xmm0
-; SSE41-NEXT: addsd %xmm7, %xmm0
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1]
+; SSE41-NEXT: addsd %xmm1, %xmm0
 ; SSE41-NEXT: addsd %xmm7, %xmm0
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1]
+; SSE41-NEXT: addsd %xmm1, %xmm0
 ; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
 ; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
 ; SSE41-NEXT: retq
@@ -1343,8 +1348,7 @@
 define double @test_v2f64_zero(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1377,8 +1381,7 @@
 define double @test_v4f64_zero(<4 x double> %a0) {
 ; SSE-LABEL: test_v4f64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE-NEXT: addsd %xmm2, %xmm0
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
@@ -1434,18 +1437,19 @@
 define double @test_v8f64_zero(<8 x double> %a0) {
 ; SSE-LABEL: test_v8f64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm4
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
 ; SSE-NEXT: addsd %xmm4, %xmm0
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: addsd %xmm3, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm3, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: test_v8f64_zero:
@@ -1526,30 +1530,35 @@
 define double @test_v16f64_zero(<16 x double> %a0) {
 ; SSE-LABEL: test_v16f64_zero:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm8
-; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm8 = xmm0[1],xmm8[1]
 ; SSE-NEXT: addsd %xmm8, %xmm0
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: addsd %xmm3, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm3, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm4, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-NEXT: addsd %xmm4, %xmm0
-; SSE-NEXT: addsd %xmm5, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm5, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm6, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
-; SSE-NEXT: addsd %xmm6, %xmm0
-; SSE-NEXT: addsd %xmm7, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm7, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: test_v16f64_zero:
@@ -1754,11 +1763,13 @@
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: addsd %xmm3, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm3, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v8f64_undef:
@@ -1810,23 +1821,29 @@
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: addsd %xmm2, %xmm0
-; SSE-NEXT: addsd %xmm3, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm3, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm4, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1]
-; SSE-NEXT: addsd %xmm4, %xmm0
-; SSE-NEXT: addsd %xmm5, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm5, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm6, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1]
-; SSE-NEXT: addsd %xmm6, %xmm0
-; SSE-NEXT: addsd %xmm7, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: addsd %xmm7, %xmm0
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1]
+; SSE-NEXT: addsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_v16f64_undef:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-fmin-fast.ll
@@ -46,8 +46,7 @@
 define float @test_v4f32(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: maxps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -56,8 +55,7 @@
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: maxps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: maxss %xmm1, %xmm0
@@ -86,8 +84,8 @@
 ; SSE2-LABEL: test_v8f32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: minps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: minps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -97,8 +95,8 @@
 ; SSE41-LABEL: test_v8f32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: minps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: minss %xmm1, %xmm0
@@ -135,8 +133,8 @@
 ; SSE2-NEXT: maxps %xmm3, %xmm1
 ; SSE2-NEXT: maxps %xmm2, %xmm0
 ; SSE2-NEXT: maxps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: maxps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -148,8 +146,8 @@
 ; SSE41-NEXT: maxps %xmm3, %xmm1
 ; SSE41-NEXT: maxps %xmm2, %xmm0
 ; SSE41-NEXT: maxps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: maxps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: maxss %xmm1, %xmm0
@@ -190,8 +188,7 @@
 define double @test_v2f64(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: minsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -214,8 +211,8 @@
 ; SSE-LABEL: test_v4f64:
 ; SSE: # %bb.0:
 ; SSE-NEXT: maxpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: maxsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -246,8 +243,8 @@
 ; SSE-NEXT: minpd %xmm3, %xmm1
 ; SSE-NEXT: minpd %xmm2, %xmm0
 ; SSE-NEXT: minpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: minsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -285,8 +282,8 @@
 ; SSE-NEXT: maxpd %xmm5, %xmm1
 ; SSE-NEXT: maxpd %xmm3, %xmm1
 ; SSE-NEXT: maxpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: maxsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll
@@ -51,8 +51,7 @@
 define float @test_v4f32(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: maxps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -61,8 +60,7 @@
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: maxps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: maxss %xmm1, %xmm0
@@ -91,8 +89,8 @@
 ; SSE2-LABEL: test_v8f32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: maxps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: maxps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -102,8 +100,8 @@
 ; SSE41-LABEL: test_v8f32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: maxps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: maxps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: maxss %xmm1, %xmm0
@@ -140,8 +138,8 @@
 ; SSE2-NEXT: maxps %xmm3, %xmm1
 ; SSE2-NEXT: maxps %xmm2, %xmm0
 ; SSE2-NEXT: maxps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: maxps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -153,8 +151,8 @@
 ; SSE41-NEXT: maxps %xmm3, %xmm1
 ; SSE41-NEXT: maxps %xmm2, %xmm0
 ; SSE41-NEXT: maxps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: maxps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: maxss %xmm1, %xmm0
@@ -195,8 +193,7 @@
 define double @test_v2f64(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: maxsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -221,8 +218,8 @@
 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[0],mem[1]
 ; SSE2-NEXT: maxpd %xmm2, %xmm0
-; SSE2-NEXT: movapd %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: maxsd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -231,8 +228,8 @@
 ; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],mem[1]
 ; SSE41-NEXT: maxpd %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: maxsd %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
@@ -261,8 +258,8 @@
 ; SSE-LABEL: test_v4f64:
 ; SSE: # %bb.0:
 ; SSE-NEXT: maxpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: maxsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -293,8 +290,8 @@
 ; SSE-NEXT: maxpd %xmm3, %xmm1
 ; SSE-NEXT: maxpd %xmm2, %xmm0
 ; SSE-NEXT: maxpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: maxsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -332,8 +329,8 @@
 ; SSE-NEXT: maxpd %xmm5, %xmm1
 ; SSE-NEXT: maxpd %xmm3, %xmm1
 ; SSE-NEXT: maxpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: maxsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
@@ -136,7 +136,7 @@
 define float @test_v4f32(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE2-NEXT: movaps %xmm0, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
 ; SSE2-NEXT: movaps %xmm0, %xmm1
@@ -145,7 +145,6 @@
 ; SSE2-NEXT: andps %xmm3, %xmm4
 ; SSE2-NEXT: maxss %xmm0, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE2-NEXT: andnps %xmm3, %xmm1
 ; SSE2-NEXT: orps %xmm4, %xmm1
 ; SSE2-NEXT: movaps %xmm2, %xmm3
@@ -167,7 +166,7 @@
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE41-NEXT: movaps %xmm0, %xmm1
 ; SSE41-NEXT: cmpunordss %xmm0, %xmm1
@@ -175,7 +174,6 @@
 ; SSE41-NEXT: andps %xmm3, %xmm4
 ; SSE41-NEXT: maxss %xmm0, %xmm3
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE41-NEXT: andnps %xmm3, %xmm1
 ; SSE41-NEXT: orps %xmm4, %xmm1
 ; SSE41-NEXT: movaps %xmm2, %xmm3
@@ -280,8 +278,8 @@
 ; SSE41-NEXT: maxss %xmm2, %xmm1
 ; SSE41-NEXT: andnps %xmm1, %xmm0
 ; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: movaps %xmm1, %xmm3
 ; SSE41-NEXT: maxss %xmm0, %xmm3
 ; SSE41-NEXT: cmpunordss %xmm0, %xmm0
@@ -473,8 +471,8 @@
 ; SSE41-NEXT: maxss %xmm1, %xmm2
 ; SSE41-NEXT: andnps %xmm2, %xmm0
 ; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: movaps %xmm2, %xmm3
 ; SSE41-NEXT: maxss %xmm0, %xmm3
 ; SSE41-NEXT: cmpunordss %xmm0, %xmm0
@@ -668,9 +666,8 @@
 define double @test_v2f64(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: cmpunordsd %xmm0, %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm3
 ; SSE-NEXT: andpd %xmm2, %xmm3
@@ -726,8 +723,8 @@
 ; SSE41-NEXT: maxpd %xmm0, %xmm2
 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: movapd %xmm2, %xmm0
 ; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
 ; SSE41-NEXT: movapd %xmm0, %xmm3
@@ -822,8 +819,8 @@
 ; SSE41-NEXT: cmpunordpd %xmm4, %xmm4
 ; SSE41-NEXT: movapd %xmm4, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: movapd %xmm1, %xmm0
 ; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
 ; SSE41-NEXT: movapd %xmm0, %xmm3
@@ -1015,8 +1012,8 @@
 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
 ; SSE41-NEXT: movapd %xmm2, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: movapd %xmm1, %xmm0
 ; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
 ; SSE41-NEXT: movapd %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll
@@ -51,23 +51,19 @@
 define float @test_v3f32(<3 x float> %a0) {
 ; SSE2-LABEL: test_v3f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1]
 ; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: minss %xmm2, %xmm1
-; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE2-NEXT: minss %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE2-NEXT: minss %xmm1, %xmm0
+; SSE2-NEXT: minss %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test_v3f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: minss %xmm2, %xmm1
-; SSE41-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE41-NEXT: minss %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE41-NEXT: minss %xmm1, %xmm0
+; SSE41-NEXT: minss %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_v3f32:
@@ -92,8 +88,7 @@
 define float @test_v4f32(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: minps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -102,8 +97,7 @@
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: minps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: minss %xmm1, %xmm0
@@ -132,8 +126,8 @@
 ; SSE2-LABEL: test_v8f32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: minps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: minps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -143,8 +137,8 @@
 ; SSE41-LABEL: test_v8f32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: minps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: minss %xmm1, %xmm0
@@ -181,8 +175,8 @@
 ; SSE2-NEXT: minps %xmm3, %xmm1
 ; SSE2-NEXT: minps %xmm2, %xmm0
 ; SSE2-NEXT: minps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: minps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -194,8 +188,8 @@
 ; SSE41-NEXT: minps %xmm3, %xmm1
 ; SSE41-NEXT: minps %xmm2, %xmm0
 ; SSE41-NEXT: minps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: minps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: minss %xmm1, %xmm0
@@ -236,8 +230,7 @@
 define double @test_v2f64(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: minsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -260,8 +253,8 @@
 ; SSE-LABEL: test_v4f64:
 ; SSE: # %bb.0:
 ; SSE-NEXT: minpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: minsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -292,8 +285,8 @@
 ; SSE-NEXT: minpd %xmm3, %xmm1
 ; SSE-NEXT: minpd %xmm2, %xmm0
 ; SSE-NEXT: minpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: minsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -331,8 +324,8 @@
 ; SSE-NEXT: minpd %xmm5, %xmm1
 ; SSE-NEXT: minpd %xmm3, %xmm1
 ; SSE-NEXT: minpd %xmm1, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE-NEXT: minsd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -69,7 +69,7 @@
 define float @test_v4f32(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE2-NEXT: movaps %xmm0, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1]
 ; SSE2-NEXT: movaps %xmm0, %xmm1
@@ -78,7 +78,6 @@
 ; SSE2-NEXT: andps %xmm3, %xmm4
 ; SSE2-NEXT: minss %xmm0, %xmm3
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE2-NEXT: andnps %xmm3, %xmm1
 ; SSE2-NEXT: orps %xmm4, %xmm1
 ; SSE2-NEXT: movaps %xmm2, %xmm3
@@ -100,7 +99,7 @@
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
 ; SSE41-NEXT: movaps %xmm0, %xmm1
 ; SSE41-NEXT: cmpunordss %xmm0, %xmm1
@@ -108,7 +107,6 @@
 ; SSE41-NEXT: andps %xmm3, %xmm4
 ; SSE41-NEXT: minss %xmm0, %xmm3
 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
 ; SSE41-NEXT: andnps %xmm3, %xmm1
 ; SSE41-NEXT: orps %xmm4, %xmm1
 ; SSE41-NEXT: movaps %xmm2, %xmm3
@@ -213,8 +211,8 @@
 ; SSE41-NEXT: minss %xmm2, %xmm1
 ; SSE41-NEXT: andnps %xmm1, %xmm0
 ; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: movaps %xmm1, %xmm3
 ; SSE41-NEXT: minss %xmm0, %xmm3
 ; SSE41-NEXT: cmpunordss %xmm0, %xmm0
@@ -406,8 +404,8 @@
 ; SSE41-NEXT: minss %xmm1, %xmm2
 ; SSE41-NEXT: andnps %xmm2, %xmm0
 ; SSE41-NEXT: orps %xmm3, %xmm0
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: movaps %xmm2, %xmm3
 ; SSE41-NEXT: minss %xmm0, %xmm3
 ; SSE41-NEXT: cmpunordss %xmm0, %xmm0
@@ -601,9 +599,8 @@
 define double @test_v2f64(<2 x double> %a0) {
 ; SSE-LABEL: test_v2f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
-; SSE-NEXT: movapd %xmm0, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
+; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: cmpunordsd %xmm0, %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm3
 ; SSE-NEXT: andpd %xmm2, %xmm3
@@ -663,8 +660,8 @@
 ; SSE41-NEXT: minpd %xmm0, %xmm1
 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: movapd %xmm1, %xmm0
 ; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
 ; SSE41-NEXT: movapd %xmm0, %xmm3
@@ -729,8 +726,8 @@
 ; SSE41-NEXT: minpd %xmm0, %xmm2
 ; SSE41-NEXT: cmpunordpd %xmm0, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
 ; SSE41-NEXT: movapd %xmm2, %xmm0
 ; SSE41-NEXT: cmpunordsd %xmm2, %xmm0
 ; SSE41-NEXT: movapd %xmm0, %xmm3
@@ -825,8 +822,8 @@
 ; SSE41-NEXT: cmpunordpd %xmm4, %xmm4
 ; SSE41-NEXT: movapd %xmm4, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: movapd %xmm1, %xmm0
 ; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
 ; SSE41-NEXT: movapd %xmm0, %xmm3
@@ -1018,8 +1015,8 @@
 ; SSE41-NEXT: cmpunordpd %xmm2, %xmm2
 ; SSE41-NEXT: movapd %xmm2, %xmm0
 ; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: movapd %xmm1, %xmm0
 ; SSE41-NEXT: cmpunordsd %xmm1, %xmm0
 ; SSE41-NEXT: movapd %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul-fast.ll
@@ -46,8 +46,7 @@
 define float @test_v4f32(float %a0, <4 x float> %a1) {
 ; SSE2-LABEL: test_v4f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm2
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
@@ -57,8 +56,7 @@
 ;
 ; SSE41-LABEL: test_v4f32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm2
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: mulss %xmm2, %xmm1
@@ -90,8 +88,8 @@
 ; SSE2-LABEL: test_v8f32:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: mulps %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm2
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
@@ -102,8 +100,8 @@
 ; SSE41-LABEL: test_v8f32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: mulps %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm2
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: mulss %xmm2, %xmm1
@@ -143,8 +141,8 @@
 ; SSE2-NEXT: mulps %xmm4, %xmm2
 ; SSE2-NEXT: mulps %xmm3, %xmm1
 ; SSE2-NEXT: mulps %xmm2, %xmm1
-; SSE2-NEXT: movaps %xmm1, %xmm2
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm2
 ; SSE2-NEXT: movaps %xmm2, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1]
@@ -157,8 +155,8 @@
 ; SSE41-NEXT: mulps %xmm4, %xmm2
 ; SSE41-NEXT: mulps %xmm3, %xmm1
 ; SSE41-NEXT: mulps %xmm2, %xmm1
-; SSE41-NEXT: movaps %xmm1, %xmm2
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm2
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
 ; SSE41-NEXT: mulss %xmm2, %xmm1
@@ -231,8 +229,7 @@
 define float @test_v4f32_zero(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32_zero:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -241,8 +238,7 @@
 ;
 ; SSE41-LABEL: test_v4f32_zero:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: mulss %xmm1, %xmm0
@@ -271,8 +267,8 @@
 ; SSE2-LABEL: test_v8f32_zero:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: mulps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -282,8 +278,8 @@
 ; SSE41-LABEL: test_v8f32_zero:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: mulps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: mulss %xmm1, %xmm0
@@ -320,8 +316,8 @@
 ; SSE2-NEXT: mulps %xmm3, %xmm1
 ; SSE2-NEXT: mulps %xmm2, %xmm0
 ; SSE2-NEXT: mulps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -333,8 +329,8 @@
 ; SSE41-NEXT: mulps %xmm3, %xmm1
 ; SSE41-NEXT: mulps %xmm2, %xmm0
 ; SSE41-NEXT: mulps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: mulss %xmm1, %xmm0
@@ -404,8 +400,7 @@
 define float @test_v4f32_undef(<4 x float> %a0) {
 ; SSE2-LABEL: test_v4f32_undef:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -414,8 +409,7 @@
 ;
 ; SSE41-LABEL: test_v4f32_undef:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: mulss %xmm1, %xmm0
@@ -444,8 +438,8 @@
 ; SSE2-LABEL: test_v8f32_undef:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: mulps %xmm1, %xmm0
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE2-NEXT: mulps %xmm1, %xmm0
 ; SSE2-NEXT: movaps %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
@@ -455,8 +449,8 @@
 ; SSE41-LABEL: test_v8f32_undef:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: mulps %xmm1, %xmm0
-; SSE41-NEXT: movaps %xmm0, %xmm1
-; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
 ; SSE41-NEXT: mulps %xmm1, %xmm0
 ; SSE41-NEXT: movshdup {{.*#+}}
xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 @@ -493,8 +487,8 @@ ; SSE2-NEXT: mulps %xmm3, %xmm1 ; SSE2-NEXT: mulps %xmm2, %xmm0 ; SSE2-NEXT: mulps %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE2-NEXT: mulps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] @@ -506,8 +500,8 @@ ; SSE41-NEXT: mulps %xmm3, %xmm1 ; SSE41-NEXT: mulps %xmm2, %xmm0 ; SSE41-NEXT: mulps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE41-NEXT: mulps %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 @@ -548,8 +542,7 @@ define double @test_v2f64(double %a0, <2 x double> %a1) { ; SSE-LABEL: test_v2f64: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE-NEXT: mulsd %xmm1, %xmm2 ; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq @@ -575,8 +568,8 @@ ; SSE-LABEL: test_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE-NEXT: mulsd %xmm1, %xmm2 ; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq @@ -610,8 +603,8 @@ ; SSE-NEXT: mulpd %xmm4, %xmm2 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE-NEXT: mulsd %xmm1, %xmm2 ; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: retq @@ -648,12 +641,12 @@ ; SSE-NEXT: mulpd %xmm6, %xmm2 ; SSE-NEXT: mulpd %xmm7, %xmm3 ; SSE-NEXT: mulpd %xmm5, %xmm1 -; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: mulpd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm2, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; SSE-NEXT: mulsd %xmm4, %xmm1 ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq @@ -694,8 +687,7 @@ define double @test_v2f64_zero(<2 x double> %a0) { ; SSE-LABEL: test_v2f64_zero: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -718,8 +710,8 @@ ; SSE-LABEL: test_v4f64_zero: ; SSE: # %bb.0: ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -750,8 +742,8 @@ ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -789,8 +781,8 @@ ; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; 
SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -828,8 +820,7 @@ define double @test_v2f64_undef(<2 x double> %a0) { ; SSE-LABEL: test_v2f64_undef: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -852,8 +843,8 @@ ; SSE-LABEL: test_v4f64_undef: ; SSE: # %bb.0: ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -884,8 +875,8 @@ ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm2, %xmm0 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -923,8 +914,8 @@ ; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: mulpd %xmm1, %xmm0 -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -49,8 +49,8 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] ; SSE2-NEXT: mulss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: mulss %xmm2, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: mulss %xmm1, %xmm0 @@ -61,8 +61,8 @@ ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 @@ -100,8 +100,8 @@ ; SSE2-NEXT: movaps %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] ; SSE2-NEXT: mulss %xmm3, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE2-NEXT: xorps %xmm3, %xmm3 +; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] ; SSE2-NEXT: mulss %xmm3, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: mulss %xmm1, %xmm0 @@ -109,8 +109,8 @@ ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm2, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE2-NEXT: mulss %xmm2, %xmm0 @@ -121,16 +121,16 @@ ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] 
; SSE41-NEXT: mulss %xmm3, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm3 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1] ; SSE41-NEXT: mulss %xmm3, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 @@ -186,8 +186,8 @@ ; SSE2-NEXT: movaps %xmm1, %xmm5 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[1,1] ; SSE2-NEXT: mulss %xmm5, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm5 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] ; SSE2-NEXT: mulss %xmm5, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: mulss %xmm1, %xmm0 @@ -195,8 +195,8 @@ ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm2, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE2-NEXT: mulss %xmm2, %xmm0 @@ -204,8 +204,8 @@ ; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE2-NEXT: mulss %xmm3, %xmm0 @@ -213,8 +213,8 @@ ; SSE2-NEXT: movaps %xmm4, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm4, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE2-NEXT: mulss %xmm4, %xmm0 @@ -225,32 +225,32 @@ ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE41-NEXT: mulss %xmm5, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm5 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] +; SSE41-NEXT: xorps %xmm5, %xmm5 +; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1] ; SSE41-NEXT: mulss %xmm5, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: mulss %xmm3, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm3, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE41-NEXT: xorps 
%xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE41-NEXT: mulss %xmm3, %xmm0 ; SSE41-NEXT: mulss %xmm4, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm4, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3,3,3] ; SSE41-NEXT: mulss %xmm4, %xmm0 @@ -369,8 +369,7 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE2-NEXT: mulss %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: mulss %xmm2, %xmm0 @@ -380,8 +379,7 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE41-NEXT: mulss %xmm1, %xmm2 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 @@ -416,8 +414,7 @@ ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm0, %xmm2 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; SSE2-NEXT: mulss %xmm2, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: mulss %xmm3, %xmm0 @@ -425,8 +422,8 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] ; SSE2-NEXT: mulss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: mulss %xmm2, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: mulss %xmm1, %xmm0 @@ -436,16 +433,15 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm0, %xmm2 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; SSE41-NEXT: mulss %xmm2, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: mulss %xmm3, %xmm0 ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 @@ -498,8 +494,7 @@ ; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] ; SSE2-NEXT: mulss %xmm0, %xmm4 -; SSE2-NEXT: movaps %xmm0, %xmm5 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] ; SSE2-NEXT: mulss %xmm4, %xmm5 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: mulss %xmm5, %xmm0 @@ -507,8 +502,8 @@ ; SSE2-NEXT: movaps %xmm1, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE2-NEXT: mulss %xmm4, 
%xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm4 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] ; SSE2-NEXT: mulss %xmm4, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: mulss %xmm1, %xmm0 @@ -516,8 +511,8 @@ ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm2, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE2-NEXT: mulss %xmm2, %xmm0 @@ -525,8 +520,8 @@ ; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE2-NEXT: mulss %xmm3, %xmm0 @@ -536,32 +531,31 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss %xmm0, %xmm4 -; SSE41-NEXT: movaps %xmm0, %xmm5 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] ; SSE41-NEXT: mulss %xmm4, %xmm5 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: mulss %xmm5, %xmm0 ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE41-NEXT: mulss %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm4 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] ; SSE41-NEXT: mulss %xmm4, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: mulss %xmm3, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm3, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE41-NEXT: mulss %xmm3, %xmm0 @@ -677,8 +671,7 @@ ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE2-NEXT: mulss %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: mulss %xmm2, %xmm0 @@ -688,8 +681,7 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE41-NEXT: movaps %xmm0, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE41-NEXT: mulss %xmm1, %xmm2 ; 
SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 @@ -724,8 +716,7 @@ ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[1,1] ; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE2-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; SSE2-NEXT: mulss %xmm2, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: mulss %xmm3, %xmm0 @@ -733,8 +724,8 @@ ; SSE2-NEXT: movaps %xmm1, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] ; SSE2-NEXT: mulss %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm2 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE2-NEXT: mulss %xmm2, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: mulss %xmm1, %xmm0 @@ -744,16 +735,15 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE41-NEXT: movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1] ; SSE41-NEXT: mulss %xmm2, %xmm3 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: mulss %xmm3, %xmm0 ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm2 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1] ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 @@ -806,8 +796,7 @@ ; SSE2-NEXT: movaps %xmm0, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[1,1] ; SSE2-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: movaps %xmm0, %xmm5 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE2-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] ; SSE2-NEXT: mulss %xmm4, %xmm5 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: mulss %xmm5, %xmm0 @@ -815,8 +804,8 @@ ; SSE2-NEXT: movaps %xmm1, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1] ; SSE2-NEXT: mulss %xmm4, %xmm0 -; SSE2-NEXT: movaps %xmm1, %xmm4 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] ; SSE2-NEXT: mulss %xmm4, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE2-NEXT: mulss %xmm1, %xmm0 @@ -824,8 +813,8 @@ ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm2, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE2-NEXT: mulss %xmm2, %xmm0 @@ -833,8 +822,8 @@ ; SSE2-NEXT: movaps %xmm3, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[1,1] ; SSE2-NEXT: mulss %xmm1, %xmm0 -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSE2-NEXT: mulss %xmm1, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE2-NEXT: mulss %xmm3, %xmm0 @@ -844,32 +833,31 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movshdup {{.*#+}} 
xmm4 = xmm0[1,1,3,3] ; SSE41-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE41-NEXT: movaps %xmm0, %xmm5 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1] +; SSE41-NEXT: movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1] ; SSE41-NEXT: mulss %xmm4, %xmm5 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE41-NEXT: mulss %xmm5, %xmm0 ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; SSE41-NEXT: mulss %xmm4, %xmm0 -; SSE41-NEXT: movaps %xmm1, %xmm4 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1] +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1] ; SSE41-NEXT: mulss %xmm4, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm2, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; SSE41-NEXT: mulss %xmm2, %xmm0 ; SSE41-NEXT: mulss %xmm3, %xmm0 ; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; SSE41-NEXT: mulss %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm3, %xmm1 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] ; SSE41-NEXT: mulss %xmm1, %xmm0 ; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3] ; SSE41-NEXT: mulss %xmm3, %xmm0 @@ -983,8 +971,9 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v4f64: @@ -1021,14 +1010,17 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: mulsd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm3, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-NEXT: mulsd %xmm4, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64: @@ -1081,26 +1073,33 @@ ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm2, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE2-NEXT: mulsd %xmm2, %xmm0 -; SSE2-NEXT: mulsd %xmm3, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm3, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm4, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE2-NEXT: mulsd %xmm4, %xmm0 -; SSE2-NEXT: mulsd %xmm5, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = 
xmm5[1,1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm5, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm6, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] -; SSE2-NEXT: mulsd %xmm6, %xmm0 -; SSE2-NEXT: mulsd %xmm7, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm7, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: mulsd %xmm8, %xmm0 -; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] -; SSE2-NEXT: mulsd %xmm8, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm8[1],xmm1[1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v16f64: @@ -1109,23 +1108,29 @@ ; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE41-NEXT: mulsd %xmm1, %xmm0 ; SSE41-NEXT: mulsd %xmm2, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE41-NEXT: mulsd %xmm2, %xmm0 -; SSE41-NEXT: mulsd %xmm3, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE41-NEXT: mulsd %xmm1, %xmm0 ; SSE41-NEXT: mulsd %xmm3, %xmm0 +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; SSE41-NEXT: mulsd %xmm1, %xmm0 ; SSE41-NEXT: mulsd %xmm4, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE41-NEXT: mulsd %xmm4, %xmm0 -; SSE41-NEXT: mulsd %xmm5, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; SSE41-NEXT: mulsd %xmm1, %xmm0 ; SSE41-NEXT: mulsd %xmm5, %xmm0 +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; SSE41-NEXT: mulsd %xmm1, %xmm0 ; SSE41-NEXT: mulsd %xmm6, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] -; SSE41-NEXT: mulsd %xmm6, %xmm0 -; SSE41-NEXT: mulsd %xmm7, %xmm0 -; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1] +; SSE41-NEXT: mulsd %xmm1, %xmm0 ; SSE41-NEXT: mulsd %xmm7, %xmm0 +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; SSE41-NEXT: mulsd %xmm1, %xmm0 ; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0 ; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0 ; SSE41-NEXT: retq @@ -1208,8 +1213,7 @@ define double @test_v2f64_one(<2 x double> %a0) { ; SSE-LABEL: test_v2f64_one: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -1231,8 +1235,7 @@ define double @test_v4f64_one(<4 x double> %a0) { ; SSE-LABEL: test_v4f64_one: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1] ; SSE-NEXT: mulsd %xmm2, %xmm0 ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] @@ -1267,18 +1270,19 @@ define double @test_v8f64_one(<8 x double> %a0) { ; SSE-LABEL: test_v8f64_one: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm4 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = 
xmm4[1],xmm0[1] +; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1] ; SSE-NEXT: mulsd %xmm4, %xmm0 ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: mulsd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm3, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_one: @@ -1324,30 +1328,35 @@ define double @test_v16f64_one(<16 x double> %a0) { ; SSE-LABEL: test_v16f64_one: ; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm8 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm0[1] +; SSE-NEXT: movhlps {{.*#+}} xmm8 = xmm0[1],xmm8[1] ; SSE-NEXT: mulsd %xmm8, %xmm0 ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: mulsd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm3, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-NEXT: mulsd %xmm4, %xmm0 -; SSE-NEXT: mulsd %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm5, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] -; SSE-NEXT: mulsd %xmm6, %xmm0 -; SSE-NEXT: mulsd %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm7, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64_one: @@ -1489,11 +1498,13 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: mulsd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm3, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v8f64_undef: @@ -1545,23 +1556,29 @@ ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: mulsd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; 
SSE-NEXT: mulsd %xmm3, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-NEXT: mulsd %xmm4, %xmm0 -; SSE-NEXT: mulsd %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm5, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm5[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] -; SSE-NEXT: mulsd %xmm6, %xmm0 -; SSE-NEXT: mulsd %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm6[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: mulsd %xmm7, %xmm0 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm7[1],xmm1[1] +; SSE-NEXT: mulsd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test_v16f64_undef: diff --git a/llvm/test/CodeGen/X86/vector-rem.ll b/llvm/test/CodeGen/X86/vector-rem.ll --- a/llvm/test/CodeGen/X86/vector-rem.ll +++ b/llvm/test/CodeGen/X86/vector-rem.ll @@ -82,31 +82,31 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; CHECK-NEXT: callq fmodf@PLT -; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: addq $72, 
%rsp diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -306,8 +306,7 @@ define <2 x double> @shuffle_v2f64_3u(<2 x double> %a, <2 x double> %b) { ; SSE-LABEL: shuffle_v2f64_3u: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] ; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v2f64_3u: diff --git a/llvm/test/CodeGen/X86/widen_conv-3.ll b/llvm/test/CodeGen/X86/widen_conv-3.ll --- a/llvm/test/CodeGen/X86/widen_conv-3.ll +++ b/llvm/test/CodeGen/X86/widen_conv-3.ll @@ -65,8 +65,8 @@ ; X86-SSE2-NEXT: psrad $24, %xmm0 ; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, (%eax) -; X86-SSE2-NEXT: movaps %xmm0, %xmm1 -; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; X86-SSE2-NEXT: xorps %xmm1, %xmm1 +; X86-SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; X86-SSE2-NEXT: movss %xmm1, 8(%eax) ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: movss %xmm0, 4(%eax) diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll --- a/llvm/test/CodeGen/X86/widen_conv-4.ll +++ b/llvm/test/CodeGen/X86/widen_conv-4.ll @@ -18,8 +18,8 @@ ; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE2-NEXT: movups %xmm0, (%eax) ; X86-SSE2-NEXT: movss %xmm2, 16(%eax) -; X86-SSE2-NEXT: movaps %xmm2, %xmm0 -; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] +; X86-SSE2-NEXT: xorps %xmm0, %xmm0 +; X86-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm2[1],xmm0[1] ; X86-SSE2-NEXT: movss %xmm0, 24(%eax) ; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movss %xmm2, 20(%eax) @@ -91,8 +91,8 @@ ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X86-SSE2-NEXT: cvtdq2ps %xmm1, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, (%eax) -; X86-SSE2-NEXT: movaps %xmm0, %xmm1 -; X86-SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; X86-SSE2-NEXT: xorps %xmm1, %xmm1 +; X86-SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; X86-SSE2-NEXT: movss %xmm1, 8(%eax) ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: movss %xmm0, 4(%eax)